/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#include <mmintrin.h>

/* 16-byte vector types. __v4si and __v4sf carry no explicit alignment
 * attribute; __m128 additionally requests 16-byte alignment. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same element type and size as __m128, but declared with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS targets "sse" with a minimum vector width of 128;
 * __DEFAULT_FN_ATTRS_MMX targets "mmx,sse" with a minimum vector width of 64. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))

/// Adds the 32-bit float values in the low-order bits of the operands.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The lower 32 bits of this operand are used in the calculation.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The lower 32 bits of this operand are used in the calculation.
46 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum 47 /// of the lower 32 bits of both operands. The upper 96 bits are copied from 48 /// the upper 96 bits of the first source operand. 49 static __inline__ __m128 __DEFAULT_FN_ATTRS 50 _mm_add_ss(__m128 __a, __m128 __b) 51 { 52 __a[0] += __b[0]; 53 return __a; 54 } 55 56 /// Adds two 128-bit vectors of [4 x float], and returns the results of 57 /// the addition. 58 /// 59 /// \headerfile <x86intrin.h> 60 /// 61 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions. 62 /// 63 /// \param __a 64 /// A 128-bit vector of [4 x float] containing one of the source operands. 65 /// \param __b 66 /// A 128-bit vector of [4 x float] containing one of the source operands. 67 /// \returns A 128-bit vector of [4 x float] containing the sums of both 68 /// operands. 69 static __inline__ __m128 __DEFAULT_FN_ATTRS 70 _mm_add_ps(__m128 __a, __m128 __b) 71 { 72 return (__m128)((__v4sf)__a + (__v4sf)__b); 73 } 74 75 /// Subtracts the 32-bit float value in the low-order bits of the second 76 /// operand from the corresponding value in the first operand. 77 /// 78 /// \headerfile <x86intrin.h> 79 /// 80 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions. 81 /// 82 /// \param __a 83 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits 84 /// of this operand are used in the calculation. 85 /// \param __b 86 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32 87 /// bits of this operand are used in the calculation. 88 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 89 /// difference of the lower 32 bits of both operands. The upper 96 bits are 90 /// copied from the upper 96 bits of the first source operand. 
91 static __inline__ __m128 __DEFAULT_FN_ATTRS 92 _mm_sub_ss(__m128 __a, __m128 __b) 93 { 94 __a[0] -= __b[0]; 95 return __a; 96 } 97 98 /// Subtracts each of the values of the second operand from the first 99 /// operand, both of which are 128-bit vectors of [4 x float] and returns 100 /// the results of the subtraction. 101 /// 102 /// \headerfile <x86intrin.h> 103 /// 104 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions. 105 /// 106 /// \param __a 107 /// A 128-bit vector of [4 x float] containing the minuend. 108 /// \param __b 109 /// A 128-bit vector of [4 x float] containing the subtrahend. 110 /// \returns A 128-bit vector of [4 x float] containing the differences between 111 /// both operands. 112 static __inline__ __m128 __DEFAULT_FN_ATTRS 113 _mm_sub_ps(__m128 __a, __m128 __b) 114 { 115 return (__m128)((__v4sf)__a - (__v4sf)__b); 116 } 117 118 /// Multiplies two 32-bit float values in the low-order bits of the 119 /// operands. 120 /// 121 /// \headerfile <x86intrin.h> 122 /// 123 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions. 124 /// 125 /// \param __a 126 /// A 128-bit vector of [4 x float] containing one of the source operands. 127 /// The lower 32 bits of this operand are used in the calculation. 128 /// \param __b 129 /// A 128-bit vector of [4 x float] containing one of the source operands. 130 /// The lower 32 bits of this operand are used in the calculation. 131 /// \returns A 128-bit vector of [4 x float] containing the product of the lower 132 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96 133 /// bits of the first source operand. 134 static __inline__ __m128 __DEFAULT_FN_ATTRS 135 _mm_mul_ss(__m128 __a, __m128 __b) 136 { 137 __a[0] *= __b[0]; 138 return __a; 139 } 140 141 /// Multiplies two 128-bit vectors of [4 x float] and returns the 142 /// results of the multiplication. 
143 /// 144 /// \headerfile <x86intrin.h> 145 /// 146 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions. 147 /// 148 /// \param __a 149 /// A 128-bit vector of [4 x float] containing one of the source operands. 150 /// \param __b 151 /// A 128-bit vector of [4 x float] containing one of the source operands. 152 /// \returns A 128-bit vector of [4 x float] containing the products of both 153 /// operands. 154 static __inline__ __m128 __DEFAULT_FN_ATTRS 155 _mm_mul_ps(__m128 __a, __m128 __b) 156 { 157 return (__m128)((__v4sf)__a * (__v4sf)__b); 158 } 159 160 /// Divides the value in the low-order 32 bits of the first operand by 161 /// the corresponding value in the second operand. 162 /// 163 /// \headerfile <x86intrin.h> 164 /// 165 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions. 166 /// 167 /// \param __a 168 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32 169 /// bits of this operand are used in the calculation. 170 /// \param __b 171 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits 172 /// of this operand are used in the calculation. 173 /// \returns A 128-bit vector of [4 x float] containing the quotients of the 174 /// lower 32 bits of both operands. The upper 96 bits are copied from the 175 /// upper 96 bits of the first source operand. 176 static __inline__ __m128 __DEFAULT_FN_ATTRS 177 _mm_div_ss(__m128 __a, __m128 __b) 178 { 179 __a[0] /= __b[0]; 180 return __a; 181 } 182 183 /// Divides two 128-bit vectors of [4 x float]. 184 /// 185 /// \headerfile <x86intrin.h> 186 /// 187 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions. 188 /// 189 /// \param __a 190 /// A 128-bit vector of [4 x float] containing the dividend. 191 /// \param __b 192 /// A 128-bit vector of [4 x float] containing the divisor. 193 /// \returns A 128-bit vector of [4 x float] containing the quotients of both 194 /// operands. 
195 static __inline__ __m128 __DEFAULT_FN_ATTRS 196 _mm_div_ps(__m128 __a, __m128 __b) 197 { 198 return (__m128)((__v4sf)__a / (__v4sf)__b); 199 } 200 201 /// Calculates the square root of the value stored in the low-order bits 202 /// of a 128-bit vector of [4 x float]. 203 /// 204 /// \headerfile <x86intrin.h> 205 /// 206 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions. 207 /// 208 /// \param __a 209 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 210 /// used in the calculation. 211 /// \returns A 128-bit vector of [4 x float] containing the square root of the 212 /// value in the low-order bits of the operand. 213 static __inline__ __m128 __DEFAULT_FN_ATTRS 214 _mm_sqrt_ss(__m128 __a) 215 { 216 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); 217 } 218 219 /// Calculates the square roots of the values stored in a 128-bit vector 220 /// of [4 x float]. 221 /// 222 /// \headerfile <x86intrin.h> 223 /// 224 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions. 225 /// 226 /// \param __a 227 /// A 128-bit vector of [4 x float]. 228 /// \returns A 128-bit vector of [4 x float] containing the square roots of the 229 /// values in the operand. 230 static __inline__ __m128 __DEFAULT_FN_ATTRS 231 _mm_sqrt_ps(__m128 __a) 232 { 233 return __builtin_ia32_sqrtps((__v4sf)__a); 234 } 235 236 /// Calculates the approximate reciprocal of the value stored in the 237 /// low-order bits of a 128-bit vector of [4 x float]. 238 /// 239 /// \headerfile <x86intrin.h> 240 /// 241 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions. 242 /// 243 /// \param __a 244 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 245 /// used in the calculation. 246 /// \returns A 128-bit vector of [4 x float] containing the approximate 247 /// reciprocal of the value in the low-order bits of the operand. 
248 static __inline__ __m128 __DEFAULT_FN_ATTRS 249 _mm_rcp_ss(__m128 __a) 250 { 251 return (__m128)__builtin_ia32_rcpss((__v4sf)__a); 252 } 253 254 /// Calculates the approximate reciprocals of the values stored in a 255 /// 128-bit vector of [4 x float]. 256 /// 257 /// \headerfile <x86intrin.h> 258 /// 259 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions. 260 /// 261 /// \param __a 262 /// A 128-bit vector of [4 x float]. 263 /// \returns A 128-bit vector of [4 x float] containing the approximate 264 /// reciprocals of the values in the operand. 265 static __inline__ __m128 __DEFAULT_FN_ATTRS 266 _mm_rcp_ps(__m128 __a) 267 { 268 return (__m128)__builtin_ia32_rcpps((__v4sf)__a); 269 } 270 271 /// Calculates the approximate reciprocal of the square root of the value 272 /// stored in the low-order bits of a 128-bit vector of [4 x float]. 273 /// 274 /// \headerfile <x86intrin.h> 275 /// 276 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions. 277 /// 278 /// \param __a 279 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 280 /// used in the calculation. 281 /// \returns A 128-bit vector of [4 x float] containing the approximate 282 /// reciprocal of the square root of the value in the low-order bits of the 283 /// operand. 284 static __inline__ __m128 __DEFAULT_FN_ATTRS 285 _mm_rsqrt_ss(__m128 __a) 286 { 287 return __builtin_ia32_rsqrtss((__v4sf)__a); 288 } 289 290 /// Calculates the approximate reciprocals of the square roots of the 291 /// values stored in a 128-bit vector of [4 x float]. 292 /// 293 /// \headerfile <x86intrin.h> 294 /// 295 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions. 296 /// 297 /// \param __a 298 /// A 128-bit vector of [4 x float]. 299 /// \returns A 128-bit vector of [4 x float] containing the approximate 300 /// reciprocals of the square roots of the values in the operand. 
301 static __inline__ __m128 __DEFAULT_FN_ATTRS 302 _mm_rsqrt_ps(__m128 __a) 303 { 304 return __builtin_ia32_rsqrtps((__v4sf)__a); 305 } 306 307 /// Compares two 32-bit float values in the low-order bits of both 308 /// operands and returns the lesser value in the low-order bits of the 309 /// vector of [4 x float]. 310 /// 311 /// \headerfile <x86intrin.h> 312 /// 313 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions. 314 /// 315 /// \param __a 316 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 317 /// 32 bits of this operand are used in the comparison. 318 /// \param __b 319 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 320 /// 32 bits of this operand are used in the comparison. 321 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 322 /// minimum value between both operands. The upper 96 bits are copied from 323 /// the upper 96 bits of the first source operand. 324 static __inline__ __m128 __DEFAULT_FN_ATTRS 325 _mm_min_ss(__m128 __a, __m128 __b) 326 { 327 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); 328 } 329 330 /// Compares two 128-bit vectors of [4 x float] and returns the lesser 331 /// of each pair of values. 332 /// 333 /// \headerfile <x86intrin.h> 334 /// 335 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions. 336 /// 337 /// \param __a 338 /// A 128-bit vector of [4 x float] containing one of the operands. 339 /// \param __b 340 /// A 128-bit vector of [4 x float] containing one of the operands. 341 /// \returns A 128-bit vector of [4 x float] containing the minimum values 342 /// between both operands. 
343 static __inline__ __m128 __DEFAULT_FN_ATTRS 344 _mm_min_ps(__m128 __a, __m128 __b) 345 { 346 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); 347 } 348 349 /// Compares two 32-bit float values in the low-order bits of both 350 /// operands and returns the greater value in the low-order bits of a 128-bit 351 /// vector of [4 x float]. 352 /// 353 /// \headerfile <x86intrin.h> 354 /// 355 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions. 356 /// 357 /// \param __a 358 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 359 /// 32 bits of this operand are used in the comparison. 360 /// \param __b 361 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 362 /// 32 bits of this operand are used in the comparison. 363 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 364 /// maximum value between both operands. The upper 96 bits are copied from 365 /// the upper 96 bits of the first source operand. 366 static __inline__ __m128 __DEFAULT_FN_ATTRS 367 _mm_max_ss(__m128 __a, __m128 __b) 368 { 369 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); 370 } 371 372 /// Compares two 128-bit vectors of [4 x float] and returns the greater 373 /// of each pair of values. 374 /// 375 /// \headerfile <x86intrin.h> 376 /// 377 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions. 378 /// 379 /// \param __a 380 /// A 128-bit vector of [4 x float] containing one of the operands. 381 /// \param __b 382 /// A 128-bit vector of [4 x float] containing one of the operands. 383 /// \returns A 128-bit vector of [4 x float] containing the maximum values 384 /// between both operands. 385 static __inline__ __m128 __DEFAULT_FN_ATTRS 386 _mm_max_ps(__m128 __a, __m128 __b) 387 { 388 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); 389 } 390 391 /// Performs a bitwise AND of two 128-bit vectors of [4 x float]. 
392 /// 393 /// \headerfile <x86intrin.h> 394 /// 395 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions. 396 /// 397 /// \param __a 398 /// A 128-bit vector containing one of the source operands. 399 /// \param __b 400 /// A 128-bit vector containing one of the source operands. 401 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 402 /// values between both operands. 403 static __inline__ __m128 __DEFAULT_FN_ATTRS 404 _mm_and_ps(__m128 __a, __m128 __b) 405 { 406 return (__m128)((__v4su)__a & (__v4su)__b); 407 } 408 409 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using 410 /// the one's complement of the values contained in the first source 411 /// operand. 412 /// 413 /// \headerfile <x86intrin.h> 414 /// 415 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions. 416 /// 417 /// \param __a 418 /// A 128-bit vector of [4 x float] containing the first source operand. The 419 /// one's complement of this value is used in the bitwise AND. 420 /// \param __b 421 /// A 128-bit vector of [4 x float] containing the second source operand. 422 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 423 /// one's complement of the first operand and the values in the second 424 /// operand. 425 static __inline__ __m128 __DEFAULT_FN_ATTRS 426 _mm_andnot_ps(__m128 __a, __m128 __b) 427 { 428 return (__m128)(~(__v4su)__a & (__v4su)__b); 429 } 430 431 /// Performs a bitwise OR of two 128-bit vectors of [4 x float]. 432 /// 433 /// \headerfile <x86intrin.h> 434 /// 435 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions. 436 /// 437 /// \param __a 438 /// A 128-bit vector of [4 x float] containing one of the source operands. 439 /// \param __b 440 /// A 128-bit vector of [4 x float] containing one of the source operands. 441 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the 442 /// values between both operands. 
443 static __inline__ __m128 __DEFAULT_FN_ATTRS 444 _mm_or_ps(__m128 __a, __m128 __b) 445 { 446 return (__m128)((__v4su)__a | (__v4su)__b); 447 } 448 449 /// Performs a bitwise exclusive OR of two 128-bit vectors of 450 /// [4 x float]. 451 /// 452 /// \headerfile <x86intrin.h> 453 /// 454 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions. 455 /// 456 /// \param __a 457 /// A 128-bit vector of [4 x float] containing one of the source operands. 458 /// \param __b 459 /// A 128-bit vector of [4 x float] containing one of the source operands. 460 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR 461 /// of the values between both operands. 462 static __inline__ __m128 __DEFAULT_FN_ATTRS 463 _mm_xor_ps(__m128 __a, __m128 __b) 464 { 465 return (__m128)((__v4su)__a ^ (__v4su)__b); 466 } 467 468 /// Compares two 32-bit float values in the low-order bits of both 469 /// operands for equality and returns the result of the comparison in the 470 /// low-order bits of a vector [4 x float]. 471 /// 472 /// \headerfile <x86intrin.h> 473 /// 474 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions. 475 /// 476 /// \param __a 477 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 478 /// 32 bits of this operand are used in the comparison. 479 /// \param __b 480 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 481 /// 32 bits of this operand are used in the comparison. 482 /// \returns A 128-bit vector of [4 x float] containing the comparison results 483 /// in the low-order bits. 484 static __inline__ __m128 __DEFAULT_FN_ATTRS 485 _mm_cmpeq_ss(__m128 __a, __m128 __b) 486 { 487 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); 488 } 489 490 /// Compares each of the corresponding 32-bit float values of the 491 /// 128-bit vectors of [4 x float] for equality. 
492 /// 493 /// \headerfile <x86intrin.h> 494 /// 495 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions. 496 /// 497 /// \param __a 498 /// A 128-bit vector of [4 x float]. 499 /// \param __b 500 /// A 128-bit vector of [4 x float]. 501 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 502 static __inline__ __m128 __DEFAULT_FN_ATTRS 503 _mm_cmpeq_ps(__m128 __a, __m128 __b) 504 { 505 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); 506 } 507 508 /// Compares two 32-bit float values in the low-order bits of both 509 /// operands to determine if the value in the first operand is less than the 510 /// corresponding value in the second operand and returns the result of the 511 /// comparison in the low-order bits of a vector of [4 x float]. 512 /// 513 /// \headerfile <x86intrin.h> 514 /// 515 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 516 /// 517 /// \param __a 518 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 519 /// 32 bits of this operand are used in the comparison. 520 /// \param __b 521 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 522 /// 32 bits of this operand are used in the comparison. 523 /// \returns A 128-bit vector of [4 x float] containing the comparison results 524 /// in the low-order bits. 525 static __inline__ __m128 __DEFAULT_FN_ATTRS 526 _mm_cmplt_ss(__m128 __a, __m128 __b) 527 { 528 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); 529 } 530 531 /// Compares each of the corresponding 32-bit float values of the 532 /// 128-bit vectors of [4 x float] to determine if the values in the first 533 /// operand are less than those in the second operand. 534 /// 535 /// \headerfile <x86intrin.h> 536 /// 537 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 538 /// 539 /// \param __a 540 /// A 128-bit vector of [4 x float]. 
541 /// \param __b 542 /// A 128-bit vector of [4 x float]. 543 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 544 static __inline__ __m128 __DEFAULT_FN_ATTRS 545 _mm_cmplt_ps(__m128 __a, __m128 __b) 546 { 547 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); 548 } 549 550 /// Compares two 32-bit float values in the low-order bits of both 551 /// operands to determine if the value in the first operand is less than or 552 /// equal to the corresponding value in the second operand and returns the 553 /// result of the comparison in the low-order bits of a vector of 554 /// [4 x float]. 555 /// 556 /// \headerfile <x86intrin.h> 557 /// 558 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 559 /// 560 /// \param __a 561 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 562 /// 32 bits of this operand are used in the comparison. 563 /// \param __b 564 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 565 /// 32 bits of this operand are used in the comparison. 566 /// \returns A 128-bit vector of [4 x float] containing the comparison results 567 /// in the low-order bits. 568 static __inline__ __m128 __DEFAULT_FN_ATTRS 569 _mm_cmple_ss(__m128 __a, __m128 __b) 570 { 571 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); 572 } 573 574 /// Compares each of the corresponding 32-bit float values of the 575 /// 128-bit vectors of [4 x float] to determine if the values in the first 576 /// operand are less than or equal to those in the second operand. 577 /// 578 /// \headerfile <x86intrin.h> 579 /// 580 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 581 /// 582 /// \param __a 583 /// A 128-bit vector of [4 x float]. 584 /// \param __b 585 /// A 128-bit vector of [4 x float]. 586 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
587 static __inline__ __m128 __DEFAULT_FN_ATTRS 588 _mm_cmple_ps(__m128 __a, __m128 __b) 589 { 590 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); 591 } 592 593 /// Compares two 32-bit float values in the low-order bits of both 594 /// operands to determine if the value in the first operand is greater than 595 /// the corresponding value in the second operand and returns the result of 596 /// the comparison in the low-order bits of a vector of [4 x float]. 597 /// 598 /// \headerfile <x86intrin.h> 599 /// 600 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. 601 /// 602 /// \param __a 603 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 604 /// 32 bits of this operand are used in the comparison. 605 /// \param __b 606 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 607 /// 32 bits of this operand are used in the comparison. 608 /// \returns A 128-bit vector of [4 x float] containing the comparison results 609 /// in the low-order bits. 610 static __inline__ __m128 __DEFAULT_FN_ATTRS 611 _mm_cmpgt_ss(__m128 __a, __m128 __b) 612 { 613 return (__m128)__builtin_shufflevector((__v4sf)__a, 614 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), 615 4, 1, 2, 3); 616 } 617 618 /// Compares each of the corresponding 32-bit float values of the 619 /// 128-bit vectors of [4 x float] to determine if the values in the first 620 /// operand are greater than those in the second operand. 621 /// 622 /// \headerfile <x86intrin.h> 623 /// 624 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. 625 /// 626 /// \param __a 627 /// A 128-bit vector of [4 x float]. 628 /// \param __b 629 /// A 128-bit vector of [4 x float]. 630 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
631 static __inline__ __m128 __DEFAULT_FN_ATTRS 632 _mm_cmpgt_ps(__m128 __a, __m128 __b) 633 { 634 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); 635 } 636 637 /// Compares two 32-bit float values in the low-order bits of both 638 /// operands to determine if the value in the first operand is greater than 639 /// or equal to the corresponding value in the second operand and returns 640 /// the result of the comparison in the low-order bits of a vector of 641 /// [4 x float]. 642 /// 643 /// \headerfile <x86intrin.h> 644 /// 645 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. 646 /// 647 /// \param __a 648 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 649 /// 32 bits of this operand are used in the comparison. 650 /// \param __b 651 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 652 /// 32 bits of this operand are used in the comparison. 653 /// \returns A 128-bit vector of [4 x float] containing the comparison results 654 /// in the low-order bits. 655 static __inline__ __m128 __DEFAULT_FN_ATTRS 656 _mm_cmpge_ss(__m128 __a, __m128 __b) 657 { 658 return (__m128)__builtin_shufflevector((__v4sf)__a, 659 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), 660 4, 1, 2, 3); 661 } 662 663 /// Compares each of the corresponding 32-bit float values of the 664 /// 128-bit vectors of [4 x float] to determine if the values in the first 665 /// operand are greater than or equal to those in the second operand. 666 /// 667 /// \headerfile <x86intrin.h> 668 /// 669 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. 670 /// 671 /// \param __a 672 /// A 128-bit vector of [4 x float]. 673 /// \param __b 674 /// A 128-bit vector of [4 x float]. 675 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
676 static __inline__ __m128 __DEFAULT_FN_ATTRS 677 _mm_cmpge_ps(__m128 __a, __m128 __b) 678 { 679 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); 680 } 681 682 /// Compares two 32-bit float values in the low-order bits of both 683 /// operands for inequality and returns the result of the comparison in the 684 /// low-order bits of a vector of [4 x float]. 685 /// 686 /// \headerfile <x86intrin.h> 687 /// 688 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c> 689 /// instructions. 690 /// 691 /// \param __a 692 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 693 /// 32 bits of this operand are used in the comparison. 694 /// \param __b 695 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 696 /// 32 bits of this operand are used in the comparison. 697 /// \returns A 128-bit vector of [4 x float] containing the comparison results 698 /// in the low-order bits. 699 static __inline__ __m128 __DEFAULT_FN_ATTRS 700 _mm_cmpneq_ss(__m128 __a, __m128 __b) 701 { 702 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); 703 } 704 705 /// Compares each of the corresponding 32-bit float values of the 706 /// 128-bit vectors of [4 x float] for inequality. 707 /// 708 /// \headerfile <x86intrin.h> 709 /// 710 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c> 711 /// instructions. 712 /// 713 /// \param __a 714 /// A 128-bit vector of [4 x float]. 715 /// \param __b 716 /// A 128-bit vector of [4 x float]. 717 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
718 static __inline__ __m128 __DEFAULT_FN_ATTRS 719 _mm_cmpneq_ps(__m128 __a, __m128 __b) 720 { 721 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); 722 } 723 724 /// Compares two 32-bit float values in the low-order bits of both 725 /// operands to determine if the value in the first operand is not less than 726 /// the corresponding value in the second operand and returns the result of 727 /// the comparison in the low-order bits of a vector of [4 x float]. 728 /// 729 /// \headerfile <x86intrin.h> 730 /// 731 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 732 /// instructions. 733 /// 734 /// \param __a 735 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 736 /// 32 bits of this operand are used in the comparison. 737 /// \param __b 738 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 739 /// 32 bits of this operand are used in the comparison. 740 /// \returns A 128-bit vector of [4 x float] containing the comparison results 741 /// in the low-order bits. 742 static __inline__ __m128 __DEFAULT_FN_ATTRS 743 _mm_cmpnlt_ss(__m128 __a, __m128 __b) 744 { 745 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); 746 } 747 748 /// Compares each of the corresponding 32-bit float values of the 749 /// 128-bit vectors of [4 x float] to determine if the values in the first 750 /// operand are not less than those in the second operand. 751 /// 752 /// \headerfile <x86intrin.h> 753 /// 754 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 755 /// instructions. 756 /// 757 /// \param __a 758 /// A 128-bit vector of [4 x float]. 759 /// \param __b 760 /// A 128-bit vector of [4 x float]. 761 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
762 static __inline__ __m128 __DEFAULT_FN_ATTRS 763 _mm_cmpnlt_ps(__m128 __a, __m128 __b) 764 { 765 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); 766 } 767 768 /// Compares two 32-bit float values in the low-order bits of both 769 /// operands to determine if the value in the first operand is not less than 770 /// or equal to the corresponding value in the second operand and returns 771 /// the result of the comparison in the low-order bits of a vector of 772 /// [4 x float]. 773 /// 774 /// \headerfile <x86intrin.h> 775 /// 776 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 777 /// instructions. 778 /// 779 /// \param __a 780 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 781 /// 32 bits of this operand are used in the comparison. 782 /// \param __b 783 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 784 /// 32 bits of this operand are used in the comparison. 785 /// \returns A 128-bit vector of [4 x float] containing the comparison results 786 /// in the low-order bits. 787 static __inline__ __m128 __DEFAULT_FN_ATTRS 788 _mm_cmpnle_ss(__m128 __a, __m128 __b) 789 { 790 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); 791 } 792 793 /// Compares each of the corresponding 32-bit float values of the 794 /// 128-bit vectors of [4 x float] to determine if the values in the first 795 /// operand are not less than or equal to those in the second operand. 796 /// 797 /// \headerfile <x86intrin.h> 798 /// 799 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 800 /// instructions. 801 /// 802 /// \param __a 803 /// A 128-bit vector of [4 x float]. 804 /// \param __b 805 /// A 128-bit vector of [4 x float]. 806 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
807 static __inline__ __m128 __DEFAULT_FN_ATTRS 808 _mm_cmpnle_ps(__m128 __a, __m128 __b) 809 { 810 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); 811 } 812 813 /// Compares two 32-bit float values in the low-order bits of both 814 /// operands to determine if the value in the first operand is not greater 815 /// than the corresponding value in the second operand and returns the 816 /// result of the comparison in the low-order bits of a vector of 817 /// [4 x float]. 818 /// 819 /// \headerfile <x86intrin.h> 820 /// 821 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> 822 /// instructions. 823 /// 824 /// \param __a 825 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 826 /// 32 bits of this operand are used in the comparison. 827 /// \param __b 828 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 829 /// 32 bits of this operand are used in the comparison. 830 /// \returns A 128-bit vector of [4 x float] containing the comparison results 831 /// in the low-order bits. 832 static __inline__ __m128 __DEFAULT_FN_ATTRS 833 _mm_cmpngt_ss(__m128 __a, __m128 __b) 834 { 835 return (__m128)__builtin_shufflevector((__v4sf)__a, 836 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), 837 4, 1, 2, 3); 838 } 839 840 /// Compares each of the corresponding 32-bit float values of the 841 /// 128-bit vectors of [4 x float] to determine if the values in the first 842 /// operand are not greater than those in the second operand. 843 /// 844 /// \headerfile <x86intrin.h> 845 /// 846 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> 847 /// instructions. 848 /// 849 /// \param __a 850 /// A 128-bit vector of [4 x float]. 851 /// \param __b 852 /// A 128-bit vector of [4 x float]. 853 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
854 static __inline__ __m128 __DEFAULT_FN_ATTRS 855 _mm_cmpngt_ps(__m128 __a, __m128 __b) 856 { 857 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); 858 } 859 860 /// Compares two 32-bit float values in the low-order bits of both 861 /// operands to determine if the value in the first operand is not greater 862 /// than or equal to the corresponding value in the second operand and 863 /// returns the result of the comparison in the low-order bits of a vector 864 /// of [4 x float]. 865 /// 866 /// \headerfile <x86intrin.h> 867 /// 868 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> 869 /// instructions. 870 /// 871 /// \param __a 872 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 873 /// 32 bits of this operand are used in the comparison. 874 /// \param __b 875 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 876 /// 32 bits of this operand are used in the comparison. 877 /// \returns A 128-bit vector of [4 x float] containing the comparison results 878 /// in the low-order bits. 879 static __inline__ __m128 __DEFAULT_FN_ATTRS 880 _mm_cmpnge_ss(__m128 __a, __m128 __b) 881 { 882 return (__m128)__builtin_shufflevector((__v4sf)__a, 883 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), 884 4, 1, 2, 3); 885 } 886 887 /// Compares each of the corresponding 32-bit float values of the 888 /// 128-bit vectors of [4 x float] to determine if the values in the first 889 /// operand are not greater than or equal to those in the second operand. 890 /// 891 /// \headerfile <x86intrin.h> 892 /// 893 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> 894 /// instructions. 895 /// 896 /// \param __a 897 /// A 128-bit vector of [4 x float]. 898 /// \param __b 899 /// A 128-bit vector of [4 x float]. 900 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
901 static __inline__ __m128 __DEFAULT_FN_ATTRS 902 _mm_cmpnge_ps(__m128 __a, __m128 __b) 903 { 904 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); 905 } 906 907 /// Compares two 32-bit float values in the low-order bits of both 908 /// operands to determine if the value in the first operand is ordered with 909 /// respect to the corresponding value in the second operand and returns the 910 /// result of the comparison in the low-order bits of a vector of 911 /// [4 x float]. 912 /// 913 /// \headerfile <x86intrin.h> 914 /// 915 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c> 916 /// instructions. 917 /// 918 /// \param __a 919 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 920 /// 32 bits of this operand are used in the comparison. 921 /// \param __b 922 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 923 /// 32 bits of this operand are used in the comparison. 924 /// \returns A 128-bit vector of [4 x float] containing the comparison results 925 /// in the low-order bits. 926 static __inline__ __m128 __DEFAULT_FN_ATTRS 927 _mm_cmpord_ss(__m128 __a, __m128 __b) 928 { 929 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); 930 } 931 932 /// Compares each of the corresponding 32-bit float values of the 933 /// 128-bit vectors of [4 x float] to determine if the values in the first 934 /// operand are ordered with respect to those in the second operand. 935 /// 936 /// \headerfile <x86intrin.h> 937 /// 938 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c> 939 /// instructions. 940 /// 941 /// \param __a 942 /// A 128-bit vector of [4 x float]. 943 /// \param __b 944 /// A 128-bit vector of [4 x float]. 945 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
946 static __inline__ __m128 __DEFAULT_FN_ATTRS 947 _mm_cmpord_ps(__m128 __a, __m128 __b) 948 { 949 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); 950 } 951 952 /// Compares two 32-bit float values in the low-order bits of both 953 /// operands to determine if the value in the first operand is unordered 954 /// with respect to the corresponding value in the second operand and 955 /// returns the result of the comparison in the low-order bits of a vector 956 /// of [4 x float]. 957 /// 958 /// \headerfile <x86intrin.h> 959 /// 960 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c> 961 /// instructions. 962 /// 963 /// \param __a 964 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 965 /// 32 bits of this operand are used in the comparison. 966 /// \param __b 967 /// A 128-bit vector of [4 x float] containing one of the operands. The lower 968 /// 32 bits of this operand are used in the comparison. 969 /// \returns A 128-bit vector of [4 x float] containing the comparison results 970 /// in the low-order bits. 971 static __inline__ __m128 __DEFAULT_FN_ATTRS 972 _mm_cmpunord_ss(__m128 __a, __m128 __b) 973 { 974 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); 975 } 976 977 /// Compares each of the corresponding 32-bit float values of the 978 /// 128-bit vectors of [4 x float] to determine if the values in the first 979 /// operand are unordered with respect to those in the second operand. 980 /// 981 /// \headerfile <x86intrin.h> 982 /// 983 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c> 984 /// instructions. 985 /// 986 /// \param __a 987 /// A 128-bit vector of [4 x float]. 988 /// \param __b 989 /// A 128-bit vector of [4 x float]. 990 /// \returns A 128-bit vector of [4 x float] containing the comparison results. 
991 static __inline__ __m128 __DEFAULT_FN_ATTRS 992 _mm_cmpunord_ps(__m128 __a, __m128 __b) 993 { 994 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); 995 } 996 997 /// Compares two 32-bit float values in the low-order bits of both 998 /// operands for equality and returns the result of the comparison. 999 /// 1000 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1001 /// 1002 /// \headerfile <x86intrin.h> 1003 /// 1004 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1005 /// instructions. 1006 /// 1007 /// \param __a 1008 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1009 /// used in the comparison. 1010 /// \param __b 1011 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1012 /// used in the comparison. 1013 /// \returns An integer containing the comparison results. If either of the 1014 /// two lower 32-bit values is NaN, 0 is returned. 1015 static __inline__ int __DEFAULT_FN_ATTRS 1016 _mm_comieq_ss(__m128 __a, __m128 __b) 1017 { 1018 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); 1019 } 1020 1021 /// Compares two 32-bit float values in the low-order bits of both 1022 /// operands to determine if the first operand is less than the second 1023 /// operand and returns the result of the comparison. 1024 /// 1025 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1026 /// 1027 /// \headerfile <x86intrin.h> 1028 /// 1029 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 1030 /// instructions. 1031 /// 1032 /// \param __a 1033 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1034 /// used in the comparison. 1035 /// \param __b 1036 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1037 /// used in the comparison. 1038 /// \returns An integer containing the comparison results. If either of the two 1039 /// lower 32-bit values is NaN, 0 is returned. 
1040 static __inline__ int __DEFAULT_FN_ATTRS 1041 _mm_comilt_ss(__m128 __a, __m128 __b) 1042 { 1043 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); 1044 } 1045 1046 /// Compares two 32-bit float values in the low-order bits of both 1047 /// operands to determine if the first operand is less than or equal to the 1048 /// second operand and returns the result of the comparison. 1049 /// 1050 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1051 /// 1052 /// \headerfile <x86intrin.h> 1053 /// 1054 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1055 /// 1056 /// \param __a 1057 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1058 /// used in the comparison. 1059 /// \param __b 1060 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1061 /// used in the comparison. 1062 /// \returns An integer containing the comparison results. If either of the two 1063 /// lower 32-bit values is NaN, 0 is returned. 1064 static __inline__ int __DEFAULT_FN_ATTRS 1065 _mm_comile_ss(__m128 __a, __m128 __b) 1066 { 1067 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); 1068 } 1069 1070 /// Compares two 32-bit float values in the low-order bits of both 1071 /// operands to determine if the first operand is greater than the second 1072 /// operand and returns the result of the comparison. 1073 /// 1074 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1075 /// 1076 /// \headerfile <x86intrin.h> 1077 /// 1078 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1079 /// 1080 /// \param __a 1081 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1082 /// used in the comparison. 1083 /// \param __b 1084 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1085 /// used in the comparison. 1086 /// \returns An integer containing the comparison results. 
If either of the 1087 /// two lower 32-bit values is NaN, 0 is returned. 1088 static __inline__ int __DEFAULT_FN_ATTRS 1089 _mm_comigt_ss(__m128 __a, __m128 __b) 1090 { 1091 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); 1092 } 1093 1094 /// Compares two 32-bit float values in the low-order bits of both 1095 /// operands to determine if the first operand is greater than or equal to 1096 /// the second operand and returns the result of the comparison. 1097 /// 1098 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1099 /// 1100 /// \headerfile <x86intrin.h> 1101 /// 1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1103 /// 1104 /// \param __a 1105 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1106 /// used in the comparison. 1107 /// \param __b 1108 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1109 /// used in the comparison. 1110 /// \returns An integer containing the comparison results. If either of the two 1111 /// lower 32-bit values is NaN, 0 is returned. 1112 static __inline__ int __DEFAULT_FN_ATTRS 1113 _mm_comige_ss(__m128 __a, __m128 __b) 1114 { 1115 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); 1116 } 1117 1118 /// Compares two 32-bit float values in the low-order bits of both 1119 /// operands to determine if the first operand is not equal to the second 1120 /// operand and returns the result of the comparison. 1121 /// 1122 /// If either of the two lower 32-bit values is NaN, 1 is returned. 1123 /// 1124 /// \headerfile <x86intrin.h> 1125 /// 1126 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. 1127 /// 1128 /// \param __a 1129 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1130 /// used in the comparison. 1131 /// \param __b 1132 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1133 /// used in the comparison. 
1134 /// \returns An integer containing the comparison results. If either of the 1135 /// two lower 32-bit values is NaN, 1 is returned. 1136 static __inline__ int __DEFAULT_FN_ATTRS 1137 _mm_comineq_ss(__m128 __a, __m128 __b) 1138 { 1139 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); 1140 } 1141 1142 /// Performs an unordered comparison of two 32-bit float values using 1143 /// the low-order bits of both operands to determine equality and returns 1144 /// the result of the comparison. 1145 /// 1146 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1147 /// 1148 /// \headerfile <x86intrin.h> 1149 /// 1150 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1151 /// 1152 /// \param __a 1153 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1154 /// used in the comparison. 1155 /// \param __b 1156 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1157 /// used in the comparison. 1158 /// \returns An integer containing the comparison results. If either of the two 1159 /// lower 32-bit values is NaN, 0 is returned. 1160 static __inline__ int __DEFAULT_FN_ATTRS 1161 _mm_ucomieq_ss(__m128 __a, __m128 __b) 1162 { 1163 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); 1164 } 1165 1166 /// Performs an unordered comparison of two 32-bit float values using 1167 /// the low-order bits of both operands to determine if the first operand is 1168 /// less than the second operand and returns the result of the comparison. 1169 /// 1170 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1171 /// 1172 /// \headerfile <x86intrin.h> 1173 /// 1174 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1175 /// 1176 /// \param __a 1177 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1178 /// used in the comparison. 1179 /// \param __b 1180 /// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are 1181 /// used in the comparison. 1182 /// \returns An integer containing the comparison results. If either of the two 1183 /// lower 32-bit values is NaN, 0 is returned. 1184 static __inline__ int __DEFAULT_FN_ATTRS 1185 _mm_ucomilt_ss(__m128 __a, __m128 __b) 1186 { 1187 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); 1188 } 1189 1190 /// Performs an unordered comparison of two 32-bit float values using 1191 /// the low-order bits of both operands to determine if the first operand is 1192 /// less than or equal to the second operand and returns the result of the 1193 /// comparison. 1194 /// 1195 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1196 /// 1197 /// \headerfile <x86intrin.h> 1198 /// 1199 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1200 /// 1201 /// \param __a 1202 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1203 /// used in the comparison. 1204 /// \param __b 1205 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1206 /// used in the comparison. 1207 /// \returns An integer containing the comparison results. If either of the two 1208 /// lower 32-bit values is NaN, 0 is returned. 1209 static __inline__ int __DEFAULT_FN_ATTRS 1210 _mm_ucomile_ss(__m128 __a, __m128 __b) 1211 { 1212 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); 1213 } 1214 1215 /// Performs an unordered comparison of two 32-bit float values using 1216 /// the low-order bits of both operands to determine if the first operand is 1217 /// greater than the second operand and returns the result of the 1218 /// comparison. 1219 /// 1220 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1221 /// 1222 /// \headerfile <x86intrin.h> 1223 /// 1224 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1225 /// 1226 /// \param __a 1227 /// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are 1228 /// used in the comparison. 1229 /// \param __b 1230 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1231 /// used in the comparison. 1232 /// \returns An integer containing the comparison results. If either of the two 1233 /// lower 32-bit values is NaN, 0 is returned. 1234 static __inline__ int __DEFAULT_FN_ATTRS 1235 _mm_ucomigt_ss(__m128 __a, __m128 __b) 1236 { 1237 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); 1238 } 1239 1240 /// Performs an unordered comparison of two 32-bit float values using 1241 /// the low-order bits of both operands to determine if the first operand is 1242 /// greater than or equal to the second operand and returns the result of 1243 /// the comparison. 1244 /// 1245 /// If either of the two lower 32-bit values is NaN, 0 is returned. 1246 /// 1247 /// \headerfile <x86intrin.h> 1248 /// 1249 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 1250 /// 1251 /// \param __a 1252 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1253 /// used in the comparison. 1254 /// \param __b 1255 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1256 /// used in the comparison. 1257 /// \returns An integer containing the comparison results. If either of the two 1258 /// lower 32-bit values is NaN, 0 is returned. 1259 static __inline__ int __DEFAULT_FN_ATTRS 1260 _mm_ucomige_ss(__m128 __a, __m128 __b) 1261 { 1262 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); 1263 } 1264 1265 /// Performs an unordered comparison of two 32-bit float values using 1266 /// the low-order bits of both operands to determine inequality and returns 1267 /// the result of the comparison. 1268 /// 1269 /// If either of the two lower 32-bit values is NaN, 1 is returned. 1270 /// 1271 /// \headerfile <x86intrin.h> 1272 /// 1273 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. 
1274 /// 1275 /// \param __a 1276 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1277 /// used in the comparison. 1278 /// \param __b 1279 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1280 /// used in the comparison. 1281 /// \returns An integer containing the comparison results. If either of the two 1282 /// lower 32-bit values is NaN, 1 is returned. 1283 static __inline__ int __DEFAULT_FN_ATTRS 1284 _mm_ucomineq_ss(__m128 __a, __m128 __b) 1285 { 1286 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); 1287 } 1288 1289 /// Converts a float value contained in the lower 32 bits of a vector of 1290 /// [4 x float] into a 32-bit integer. 1291 /// 1292 /// \headerfile <x86intrin.h> 1293 /// 1294 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1295 /// instructions. 1296 /// 1297 /// \param __a 1298 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1299 /// used in the conversion. 1300 /// \returns A 32-bit integer containing the converted value. 1301 static __inline__ int __DEFAULT_FN_ATTRS 1302 _mm_cvtss_si32(__m128 __a) 1303 { 1304 return __builtin_ia32_cvtss2si((__v4sf)__a); 1305 } 1306 1307 /// Converts a float value contained in the lower 32 bits of a vector of 1308 /// [4 x float] into a 32-bit integer. 1309 /// 1310 /// \headerfile <x86intrin.h> 1311 /// 1312 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1313 /// instructions. 1314 /// 1315 /// \param __a 1316 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1317 /// used in the conversion. 1318 /// \returns A 32-bit integer containing the converted value. 1319 static __inline__ int __DEFAULT_FN_ATTRS 1320 _mm_cvt_ss2si(__m128 __a) 1321 { 1322 return _mm_cvtss_si32(__a); 1323 } 1324 1325 #ifdef __x86_64__ 1326 1327 /// Converts a float value contained in the lower 32 bits of a vector of 1328 /// [4 x float] into a 64-bit integer. 
1329 /// 1330 /// \headerfile <x86intrin.h> 1331 /// 1332 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> 1333 /// instructions. 1334 /// 1335 /// \param __a 1336 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1337 /// used in the conversion. 1338 /// \returns A 64-bit integer containing the converted value. 1339 static __inline__ long long __DEFAULT_FN_ATTRS 1340 _mm_cvtss_si64(__m128 __a) 1341 { 1342 return __builtin_ia32_cvtss2si64((__v4sf)__a); 1343 } 1344 1345 #endif 1346 1347 /// Converts two low-order float values in a 128-bit vector of 1348 /// [4 x float] into a 64-bit vector of [2 x i32]. 1349 /// 1350 /// \headerfile <x86intrin.h> 1351 /// 1352 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1353 /// 1354 /// \param __a 1355 /// A 128-bit vector of [4 x float]. 1356 /// \returns A 64-bit integer vector containing the converted values. 1357 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1358 _mm_cvtps_pi32(__m128 __a) 1359 { 1360 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); 1361 } 1362 1363 /// Converts two low-order float values in a 128-bit vector of 1364 /// [4 x float] into a 64-bit vector of [2 x i32]. 1365 /// 1366 /// \headerfile <x86intrin.h> 1367 /// 1368 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. 1369 /// 1370 /// \param __a 1371 /// A 128-bit vector of [4 x float]. 1372 /// \returns A 64-bit integer vector containing the converted values. 1373 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1374 _mm_cvt_ps2pi(__m128 __a) 1375 { 1376 return _mm_cvtps_pi32(__a); 1377 } 1378 1379 /// Converts a float value contained in the lower 32 bits of a vector of 1380 /// [4 x float] into a 32-bit integer, truncating the result when it is 1381 /// inexact. 1382 /// 1383 /// \headerfile <x86intrin.h> 1384 /// 1385 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1386 /// instructions. 
1387 /// 1388 /// \param __a 1389 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1390 /// used in the conversion. 1391 /// \returns A 32-bit integer containing the converted value. 1392 static __inline__ int __DEFAULT_FN_ATTRS 1393 _mm_cvttss_si32(__m128 __a) 1394 { 1395 return __builtin_ia32_cvttss2si((__v4sf)__a); 1396 } 1397 1398 /// Converts a float value contained in the lower 32 bits of a vector of 1399 /// [4 x float] into a 32-bit integer, truncating the result when it is 1400 /// inexact. 1401 /// 1402 /// \headerfile <x86intrin.h> 1403 /// 1404 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1405 /// instructions. 1406 /// 1407 /// \param __a 1408 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1409 /// used in the conversion. 1410 /// \returns A 32-bit integer containing the converted value. 1411 static __inline__ int __DEFAULT_FN_ATTRS 1412 _mm_cvtt_ss2si(__m128 __a) 1413 { 1414 return _mm_cvttss_si32(__a); 1415 } 1416 1417 #ifdef __x86_64__ 1418 /// Converts a float value contained in the lower 32 bits of a vector of 1419 /// [4 x float] into a 64-bit integer, truncating the result when it is 1420 /// inexact. 1421 /// 1422 /// \headerfile <x86intrin.h> 1423 /// 1424 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> 1425 /// instructions. 1426 /// 1427 /// \param __a 1428 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1429 /// used in the conversion. 1430 /// \returns A 64-bit integer containing the converted value. 1431 static __inline__ long long __DEFAULT_FN_ATTRS 1432 _mm_cvttss_si64(__m128 __a) 1433 { 1434 return __builtin_ia32_cvttss2si64((__v4sf)__a); 1435 } 1436 #endif 1437 1438 /// Converts two low-order float values in a 128-bit vector of 1439 /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result 1440 /// when it is inexact. 
1441 /// 1442 /// \headerfile <x86intrin.h> 1443 /// 1444 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c> 1445 /// instructions. 1446 /// 1447 /// \param __a 1448 /// A 128-bit vector of [4 x float]. 1449 /// \returns A 64-bit integer vector containing the converted values. 1450 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1451 _mm_cvttps_pi32(__m128 __a) 1452 { 1453 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); 1454 } 1455 1456 /// Converts two low-order float values in a 128-bit vector of [4 x 1457 /// float] into a 64-bit vector of [2 x i32], truncating the result when it 1458 /// is inexact. 1459 /// 1460 /// \headerfile <x86intrin.h> 1461 /// 1462 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction. 1463 /// 1464 /// \param __a 1465 /// A 128-bit vector of [4 x float]. 1466 /// \returns A 64-bit integer vector containing the converted values. 1467 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1468 _mm_cvtt_ps2pi(__m128 __a) 1469 { 1470 return _mm_cvttps_pi32(__a); 1471 } 1472 1473 /// Converts a 32-bit signed integer value into a floating point value 1474 /// and writes it to the lower 32 bits of the destination. The remaining 1475 /// higher order elements of the destination vector are copied from the 1476 /// corresponding elements in the first operand. 1477 /// 1478 /// \headerfile <x86intrin.h> 1479 /// 1480 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1481 /// 1482 /// \param __a 1483 /// A 128-bit vector of [4 x float]. 1484 /// \param __b 1485 /// A 32-bit signed integer operand containing the value to be converted. 1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1487 /// converted value of the second operand. The upper 96 bits are copied from 1488 /// the upper 96 bits of the first operand. 
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS 1490 _mm_cvtsi32_ss(__m128 __a, int __b) 1491 { 1492 __a[0] = __b; 1493 return __a; 1494 } 1495 1496 /// Converts a 32-bit signed integer value into a floating point value 1497 /// and writes it to the lower 32 bits of the destination. The remaining 1498 /// higher order elements of the destination are copied from the 1499 /// corresponding elements in the first operand. 1500 /// 1501 /// \headerfile <x86intrin.h> 1502 /// 1503 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1504 /// 1505 /// \param __a 1506 /// A 128-bit vector of [4 x float]. 1507 /// \param __b 1508 /// A 32-bit signed integer operand containing the value to be converted. 1509 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1510 /// converted value of the second operand. The upper 96 bits are copied from 1511 /// the upper 96 bits of the first operand. 1512 static __inline__ __m128 __DEFAULT_FN_ATTRS 1513 _mm_cvt_si2ss(__m128 __a, int __b) 1514 { 1515 return _mm_cvtsi32_ss(__a, __b); 1516 } 1517 1518 #ifdef __x86_64__ 1519 1520 /// Converts a 64-bit signed integer value into a floating point value 1521 /// and writes it to the lower 32 bits of the destination. The remaining 1522 /// higher order elements of the destination are copied from the 1523 /// corresponding elements in the first operand. 1524 /// 1525 /// \headerfile <x86intrin.h> 1526 /// 1527 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. 1528 /// 1529 /// \param __a 1530 /// A 128-bit vector of [4 x float]. 1531 /// \param __b 1532 /// A 64-bit signed integer operand containing the value to be converted. 1533 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 1534 /// converted value of the second operand. The upper 96 bits are copied from 1535 /// the upper 96 bits of the first operand. 
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS 1537 _mm_cvtsi64_ss(__m128 __a, long long __b) 1538 { 1539 __a[0] = __b; 1540 return __a; 1541 } 1542 1543 #endif 1544 1545 /// Converts two elements of a 64-bit vector of [2 x i32] into two 1546 /// floating point values and writes them to the lower 64-bits of the 1547 /// destination. The remaining higher order elements of the destination are 1548 /// copied from the corresponding elements in the first operand. 1549 /// 1550 /// \headerfile <x86intrin.h> 1551 /// 1552 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 1553 /// 1554 /// \param __a 1555 /// A 128-bit vector of [4 x float]. 1556 /// \param __b 1557 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1558 /// and written to the corresponding low-order elements in the destination. 1559 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1560 /// converted value of the second operand. The upper 64 bits are copied from 1561 /// the upper 64 bits of the first operand. 1562 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1563 _mm_cvtpi32_ps(__m128 __a, __m64 __b) 1564 { 1565 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); 1566 } 1567 1568 /// Converts two elements of a 64-bit vector of [2 x i32] into two 1569 /// floating point values and writes them to the lower 64-bits of the 1570 /// destination. The remaining higher order elements of the destination are 1571 /// copied from the corresponding elements in the first operand. 1572 /// 1573 /// \headerfile <x86intrin.h> 1574 /// 1575 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 1576 /// 1577 /// \param __a 1578 /// A 128-bit vector of [4 x float]. 1579 /// \param __b 1580 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted 1581 /// and written to the corresponding low-order elements in the destination. 
1582 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1583 /// converted value from the second operand. The upper 64 bits are copied 1584 /// from the upper 64 bits of the first operand. 1585 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 1586 _mm_cvt_pi2ps(__m128 __a, __m64 __b) 1587 { 1588 return _mm_cvtpi32_ps(__a, __b); 1589 } 1590 1591 /// Extracts a float value contained in the lower 32 bits of a vector of 1592 /// [4 x float]. 1593 /// 1594 /// \headerfile <x86intrin.h> 1595 /// 1596 /// This intrinsic has no corresponding instruction. 1597 /// 1598 /// \param __a 1599 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1600 /// used in the extraction. 1601 /// \returns A 32-bit float containing the extracted value. 1602 static __inline__ float __DEFAULT_FN_ATTRS 1603 _mm_cvtss_f32(__m128 __a) 1604 { 1605 return __a[0]; 1606 } 1607 1608 /// Loads two packed float values from the address \a __p into the 1609 /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits 1610 /// are copied from the low-order bits of the first operand. 1611 /// 1612 /// \headerfile <x86intrin.h> 1613 /// 1614 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1615 /// 1616 /// \param __a 1617 /// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] 1618 /// of the destination. 1619 /// \param __p 1620 /// A pointer to two packed float values. Bits [63:0] are written to bits 1621 /// [127:64] of the destination. 1622 /// \returns A 128-bit vector of [4 x float] containing the moved values. 
1623 static __inline__ __m128 __DEFAULT_FN_ATTRS 1624 _mm_loadh_pi(__m128 __a, const __m64 *__p) 1625 { 1626 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 1627 struct __mm_loadh_pi_struct { 1628 __mm_loadh_pi_v2f32 __u; 1629 } __attribute__((__packed__, __may_alias__)); 1630 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u; 1631 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1632 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 1633 } 1634 1635 /// Loads two packed float values from the address \a __p into the 1636 /// low-order bits of a 128-bit vector of [4 x float]. The high-order bits 1637 /// are copied from the high-order bits of the first operand. 1638 /// 1639 /// \headerfile <x86intrin.h> 1640 /// 1641 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1642 /// 1643 /// \param __a 1644 /// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits 1645 /// [127:64] of the destination. 1646 /// \param __p 1647 /// A pointer to two packed float values. Bits [63:0] are written to bits 1648 /// [63:0] of the destination. 1649 /// \returns A 128-bit vector of [4 x float] containing the moved values. 1650 static __inline__ __m128 __DEFAULT_FN_ATTRS 1651 _mm_loadl_pi(__m128 __a, const __m64 *__p) 1652 { 1653 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 1654 struct __mm_loadl_pi_struct { 1655 __mm_loadl_pi_v2f32 __u; 1656 } __attribute__((__packed__, __may_alias__)); 1657 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u; 1658 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1659 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 1660 } 1661 1662 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1663 /// 32 bits of the vector are initialized with the single-precision 1664 /// floating-point value loaded from a specified memory location. 
The upper 1665 /// 96 bits are set to zero. 1666 /// 1667 /// \headerfile <x86intrin.h> 1668 /// 1669 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1670 /// 1671 /// \param __p 1672 /// A pointer to a 32-bit memory location containing a single-precision 1673 /// floating-point value. 1674 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1675 /// lower 32 bits contain the value loaded from the memory location. The 1676 /// upper 96 bits are set to zero. 1677 static __inline__ __m128 __DEFAULT_FN_ATTRS 1678 _mm_load_ss(const float *__p) 1679 { 1680 struct __mm_load_ss_struct { 1681 float __u; 1682 } __attribute__((__packed__, __may_alias__)); 1683 float __u = ((const struct __mm_load_ss_struct*)__p)->__u; 1684 return __extension__ (__m128){ __u, 0, 0, 0 }; 1685 } 1686 1687 /// Loads a 32-bit float value and duplicates it to all four vector 1688 /// elements of a 128-bit vector of [4 x float]. 1689 /// 1690 /// \headerfile <x86intrin.h> 1691 /// 1692 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c> 1693 /// instruction. 1694 /// 1695 /// \param __p 1696 /// A pointer to a float value to be loaded and duplicated. 1697 /// \returns A 128-bit vector of [4 x float] containing the loaded and 1698 /// duplicated values. 1699 static __inline__ __m128 __DEFAULT_FN_ATTRS 1700 _mm_load1_ps(const float *__p) 1701 { 1702 struct __mm_load1_ps_struct { 1703 float __u; 1704 } __attribute__((__packed__, __may_alias__)); 1705 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u; 1706 return __extension__ (__m128){ __u, __u, __u, __u }; 1707 } 1708 1709 #define _mm_load_ps1(p) _mm_load1_ps(p) 1710 1711 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned 1712 /// memory location. 1713 /// 1714 /// \headerfile <x86intrin.h> 1715 /// 1716 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 
1717 /// 1718 /// \param __p 1719 /// A pointer to a 128-bit memory location. The address of the memory 1720 /// location has to be 128-bit aligned. 1721 /// \returns A 128-bit vector of [4 x float] containing the loaded values. 1722 static __inline__ __m128 __DEFAULT_FN_ATTRS 1723 _mm_load_ps(const float *__p) 1724 { 1725 return *(const __m128*)__p; 1726 } 1727 1728 /// Loads a 128-bit floating-point vector of [4 x float] from an 1729 /// unaligned memory location. 1730 /// 1731 /// \headerfile <x86intrin.h> 1732 /// 1733 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1734 /// 1735 /// \param __p 1736 /// A pointer to a 128-bit memory location. The address of the memory 1737 /// location does not have to be aligned. 1738 /// \returns A 128-bit vector of [4 x float] containing the loaded values. 1739 static __inline__ __m128 __DEFAULT_FN_ATTRS 1740 _mm_loadu_ps(const float *__p) 1741 { 1742 struct __loadu_ps { 1743 __m128_u __v; 1744 } __attribute__((__packed__, __may_alias__)); 1745 return ((const struct __loadu_ps*)__p)->__v; 1746 } 1747 1748 /// Loads four packed float values, in reverse order, from an aligned 1749 /// memory location to 32-bit elements in a 128-bit vector of [4 x float]. 1750 /// 1751 /// \headerfile <x86intrin.h> 1752 /// 1753 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 1754 /// instruction. 1755 /// 1756 /// \param __p 1757 /// A pointer to a 128-bit memory location. The address of the memory 1758 /// location has to be 128-bit aligned. 1759 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded 1760 /// in reverse order. 1761 static __inline__ __m128 __DEFAULT_FN_ATTRS 1762 _mm_loadr_ps(const float *__p) 1763 { 1764 __m128 __a = _mm_load_ps(__p); 1765 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 1766 } 1767 1768 /// Create a 128-bit vector of [4 x float] with undefined values. 
1769 /// 1770 /// \headerfile <x86intrin.h> 1771 /// 1772 /// This intrinsic has no corresponding instruction. 1773 /// 1774 /// \returns A 128-bit vector of [4 x float] containing undefined values. 1775 static __inline__ __m128 __DEFAULT_FN_ATTRS 1776 _mm_undefined_ps(void) 1777 { 1778 return (__m128)__builtin_ia32_undef128(); 1779 } 1780 1781 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower 1782 /// 32 bits of the vector are initialized with the specified single-precision 1783 /// floating-point value. The upper 96 bits are set to zero. 1784 /// 1785 /// \headerfile <x86intrin.h> 1786 /// 1787 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1788 /// 1789 /// \param __w 1790 /// A single-precision floating-point value used to initialize the lower 32 1791 /// bits of the result. 1792 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The 1793 /// lower 32 bits contain the value provided in the source operand. The 1794 /// upper 96 bits are set to zero. 1795 static __inline__ __m128 __DEFAULT_FN_ATTRS 1796 _mm_set_ss(float __w) 1797 { 1798 return __extension__ (__m128){ __w, 0, 0, 0 }; 1799 } 1800 1801 /// Constructs a 128-bit floating-point vector of [4 x float], with each 1802 /// of the four single-precision floating-point vector elements set to the 1803 /// specified single-precision floating-point value. 1804 /// 1805 /// \headerfile <x86intrin.h> 1806 /// 1807 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1808 /// 1809 /// \param __w 1810 /// A single-precision floating-point value used to initialize each vector 1811 /// element of the result. 1812 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 1813 static __inline__ __m128 __DEFAULT_FN_ATTRS 1814 _mm_set1_ps(float __w) 1815 { 1816 return __extension__ (__m128){ __w, __w, __w, __w }; 1817 } 1818 1819 /* Microsoft specific. 
*/ 1820 /// Constructs a 128-bit floating-point vector of [4 x float], with each 1821 /// of the four single-precision floating-point vector elements set to the 1822 /// specified single-precision floating-point value. 1823 /// 1824 /// \headerfile <x86intrin.h> 1825 /// 1826 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. 1827 /// 1828 /// \param __w 1829 /// A single-precision floating-point value used to initialize each vector 1830 /// element of the result. 1831 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 1832 static __inline__ __m128 __DEFAULT_FN_ATTRS 1833 _mm_set_ps1(float __w) 1834 { 1835 return _mm_set1_ps(__w); 1836 } 1837 1838 /// Constructs a 128-bit floating-point vector of [4 x float] 1839 /// initialized with the specified single-precision floating-point values. 1840 /// 1841 /// \headerfile <x86intrin.h> 1842 /// 1843 /// This intrinsic is a utility function and does not correspond to a specific 1844 /// instruction. 1845 /// 1846 /// \param __z 1847 /// A single-precision floating-point value used to initialize bits [127:96] 1848 /// of the result. 1849 /// \param __y 1850 /// A single-precision floating-point value used to initialize bits [95:64] 1851 /// of the result. 1852 /// \param __x 1853 /// A single-precision floating-point value used to initialize bits [63:32] 1854 /// of the result. 1855 /// \param __w 1856 /// A single-precision floating-point value used to initialize bits [31:0] 1857 /// of the result. 1858 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 1859 static __inline__ __m128 __DEFAULT_FN_ATTRS 1860 _mm_set_ps(float __z, float __y, float __x, float __w) 1861 { 1862 return __extension__ (__m128){ __w, __x, __y, __z }; 1863 } 1864 1865 /// Constructs a 128-bit floating-point vector of [4 x float], 1866 /// initialized in reverse order with the specified 32-bit single-precision 1867 /// float-point values. 
1868 /// 1869 /// \headerfile <x86intrin.h> 1870 /// 1871 /// This intrinsic is a utility function and does not correspond to a specific 1872 /// instruction. 1873 /// 1874 /// \param __z 1875 /// A single-precision floating-point value used to initialize bits [31:0] 1876 /// of the result. 1877 /// \param __y 1878 /// A single-precision floating-point value used to initialize bits [63:32] 1879 /// of the result. 1880 /// \param __x 1881 /// A single-precision floating-point value used to initialize bits [95:64] 1882 /// of the result. 1883 /// \param __w 1884 /// A single-precision floating-point value used to initialize bits [127:96] 1885 /// of the result. 1886 /// \returns An initialized 128-bit floating-point vector of [4 x float]. 1887 static __inline__ __m128 __DEFAULT_FN_ATTRS 1888 _mm_setr_ps(float __z, float __y, float __x, float __w) 1889 { 1890 return __extension__ (__m128){ __z, __y, __x, __w }; 1891 } 1892 1893 /// Constructs a 128-bit floating-point vector of [4 x float] initialized 1894 /// to zero. 1895 /// 1896 /// \headerfile <x86intrin.h> 1897 /// 1898 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1899 /// 1900 /// \returns An initialized 128-bit floating-point vector of [4 x float] with 1901 /// all elements set to zero. 1902 static __inline__ __m128 __DEFAULT_FN_ATTRS 1903 _mm_setzero_ps(void) 1904 { 1905 return __extension__ (__m128){ 0, 0, 0, 0 }; 1906 } 1907 1908 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a 1909 /// memory location. 1910 /// 1911 /// \headerfile <x86intrin.h> 1912 /// 1913 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1914 /// 1915 /// \param __p 1916 /// A pointer to a 64-bit memory location. 1917 /// \param __a 1918 /// A 128-bit vector of [4 x float] containing the values to be stored. 
1919 static __inline__ void __DEFAULT_FN_ATTRS 1920 _mm_storeh_pi(__m64 *__p, __m128 __a) 1921 { 1922 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); 1923 struct __mm_storeh_pi_struct { 1924 __mm_storeh_pi_v2f32 __u; 1925 } __attribute__((__packed__, __may_alias__)); 1926 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3); 1927 } 1928 1929 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a 1930 /// memory location. 1931 /// 1932 /// \headerfile <x86intrin.h> 1933 /// 1934 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 1935 /// 1936 /// \param __p 1937 /// A pointer to a memory location that will receive the float values. 1938 /// \param __a 1939 /// A 128-bit vector of [4 x float] containing the values to be stored. 1940 static __inline__ void __DEFAULT_FN_ATTRS 1941 _mm_storel_pi(__m64 *__p, __m128 __a) 1942 { 1943 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); 1944 struct __mm_storeh_pi_struct { 1945 __mm_storeh_pi_v2f32 __u; 1946 } __attribute__((__packed__, __may_alias__)); 1947 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1); 1948 } 1949 1950 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a 1951 /// memory location. 1952 /// 1953 /// \headerfile <x86intrin.h> 1954 /// 1955 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 1956 /// 1957 /// \param __p 1958 /// A pointer to a 32-bit memory location. 1959 /// \param __a 1960 /// A 128-bit vector of [4 x float] containing the value to be stored. 1961 static __inline__ void __DEFAULT_FN_ATTRS 1962 _mm_store_ss(float *__p, __m128 __a) 1963 { 1964 struct __mm_store_ss_struct { 1965 float __u; 1966 } __attribute__((__packed__, __may_alias__)); 1967 ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 1968 } 1969 1970 /// Stores a 128-bit vector of [4 x float] to an unaligned memory 1971 /// location. 
1972 /// 1973 /// \headerfile <x86intrin.h> 1974 /// 1975 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 1976 /// 1977 /// \param __p 1978 /// A pointer to a 128-bit memory location. The address of the memory 1979 /// location does not have to be aligned. 1980 /// \param __a 1981 /// A 128-bit vector of [4 x float] containing the values to be stored. 1982 static __inline__ void __DEFAULT_FN_ATTRS 1983 _mm_storeu_ps(float *__p, __m128 __a) 1984 { 1985 struct __storeu_ps { 1986 __m128_u __v; 1987 } __attribute__((__packed__, __may_alias__)); 1988 ((struct __storeu_ps*)__p)->__v = __a; 1989 } 1990 1991 /// Stores a 128-bit vector of [4 x float] into an aligned memory 1992 /// location. 1993 /// 1994 /// \headerfile <x86intrin.h> 1995 /// 1996 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 1997 /// 1998 /// \param __p 1999 /// A pointer to a 128-bit memory location. The address of the memory 2000 /// location has to be 16-byte aligned. 2001 /// \param __a 2002 /// A 128-bit vector of [4 x float] containing the values to be stored. 2003 static __inline__ void __DEFAULT_FN_ATTRS 2004 _mm_store_ps(float *__p, __m128 __a) 2005 { 2006 *(__m128*)__p = __a; 2007 } 2008 2009 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2010 /// four contiguous elements in an aligned memory location. 2011 /// 2012 /// \headerfile <x86intrin.h> 2013 /// 2014 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2015 /// instruction. 2016 /// 2017 /// \param __p 2018 /// A pointer to a 128-bit memory location. 2019 /// \param __a 2020 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2021 /// of the four contiguous elements pointed by \a __p. 
2022 static __inline__ void __DEFAULT_FN_ATTRS 2023 _mm_store1_ps(float *__p, __m128 __a) 2024 { 2025 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); 2026 _mm_store_ps(__p, __a); 2027 } 2028 2029 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into 2030 /// four contiguous elements in an aligned memory location. 2031 /// 2032 /// \headerfile <x86intrin.h> 2033 /// 2034 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> 2035 /// instruction. 2036 /// 2037 /// \param __p 2038 /// A pointer to a 128-bit memory location. 2039 /// \param __a 2040 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each 2041 /// of the four contiguous elements pointed by \a __p. 2042 static __inline__ void __DEFAULT_FN_ATTRS 2043 _mm_store_ps1(float *__p, __m128 __a) 2044 { 2045 _mm_store1_ps(__p, __a); 2046 } 2047 2048 /// Stores float values from a 128-bit vector of [4 x float] to an 2049 /// aligned memory location in reverse order. 2050 /// 2051 /// \headerfile <x86intrin.h> 2052 /// 2053 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> 2054 /// instruction. 2055 /// 2056 /// \param __p 2057 /// A pointer to a 128-bit memory location. The address of the memory 2058 /// location has to be 128-bit aligned. 2059 /// \param __a 2060 /// A 128-bit vector of [4 x float] containing the values to be stored. 2061 static __inline__ void __DEFAULT_FN_ATTRS 2062 _mm_storer_ps(float *__p, __m128 __a) 2063 { 2064 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 2065 _mm_store_ps(__p, __a); 2066 } 2067 2068 #define _MM_HINT_ET0 7 2069 #define _MM_HINT_ET1 6 2070 #define _MM_HINT_T0 3 2071 #define _MM_HINT_T1 2 2072 #define _MM_HINT_T2 1 2073 #define _MM_HINT_NTA 0 2074 2075 #ifndef _MSC_VER 2076 /* FIXME: We have to #define this because "sel" must be a constant integer, and 2077 Sema doesn't do any form of constant propagation yet. 
*/ 2078 2079 /// Loads one cache line of data from the specified address to a location 2080 /// closer to the processor. 2081 /// 2082 /// \headerfile <x86intrin.h> 2083 /// 2084 /// \code 2085 /// void _mm_prefetch(const void * a, const int sel); 2086 /// \endcode 2087 /// 2088 /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction. 2089 /// 2090 /// \param a 2091 /// A pointer to a memory location containing a cache line of data. 2092 /// \param sel 2093 /// A predefined integer constant specifying the type of prefetch 2094 /// operation: \n 2095 /// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The 2096 /// PREFETCHNTA instruction will be generated. \n 2097 /// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will 2098 /// be generated. \n 2099 /// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will 2100 /// be generated. \n 2101 /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will 2102 /// be generated. 2103 #define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \ 2104 ((sel) >> 2) & 1, (sel) & 0x3)) 2105 #endif 2106 2107 /// Stores a 64-bit integer in the specified aligned memory location. To 2108 /// minimize caching, the data is flagged as non-temporal (unlikely to be 2109 /// used again soon). 2110 /// 2111 /// \headerfile <x86intrin.h> 2112 /// 2113 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction. 2114 /// 2115 /// \param __p 2116 /// A pointer to an aligned memory location used to store the register value. 2117 /// \param __a 2118 /// A 64-bit integer containing the value to be stored. 2119 static __inline__ void __DEFAULT_FN_ATTRS_MMX 2120 _mm_stream_pi(__m64 *__p, __m64 __a) 2121 { 2122 __builtin_ia32_movntq(__p, __a); 2123 } 2124 2125 /// Moves packed float values from a 128-bit vector of [4 x float] to a 2126 /// 128-bit aligned memory location. 
To minimize caching, the data is flagged 2127 /// as non-temporal (unlikely to be used again soon). 2128 /// 2129 /// \headerfile <x86intrin.h> 2130 /// 2131 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 2132 /// 2133 /// \param __p 2134 /// A pointer to a 128-bit aligned memory location that will receive the 2135 /// single-precision floating-point values. 2136 /// \param __a 2137 /// A 128-bit vector of [4 x float] containing the values to be moved. 2138 static __inline__ void __DEFAULT_FN_ATTRS 2139 _mm_stream_ps(float *__p, __m128 __a) 2140 { 2141 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); 2142 } 2143 2144 #if defined(__cplusplus) 2145 extern "C" { 2146 #endif 2147 2148 /// Forces strong memory ordering (serialization) between store 2149 /// instructions preceding this instruction and store instructions following 2150 /// this instruction, ensuring the system completes all previous stores 2151 /// before executing subsequent stores. 2152 /// 2153 /// \headerfile <x86intrin.h> 2154 /// 2155 /// This intrinsic corresponds to the <c> SFENCE </c> instruction. 2156 /// 2157 void _mm_sfence(void); 2158 2159 #if defined(__cplusplus) 2160 } // extern "C" 2161 #endif 2162 2163 /// Extracts 16-bit element from a 64-bit vector of [4 x i16] and 2164 /// returns it, as specified by the immediate integer operand. 2165 /// 2166 /// \headerfile <x86intrin.h> 2167 /// 2168 /// \code 2169 /// int _mm_extract_pi16(__m64 a, int n); 2170 /// \endcode 2171 /// 2172 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 2173 /// 2174 /// \param a 2175 /// A 64-bit vector of [4 x i16]. 2176 /// \param n 2177 /// An immediate integer operand that determines which bits are extracted: \n 2178 /// 0: Bits [15:0] are copied to the destination. \n 2179 /// 1: Bits [31:16] are copied to the destination. \n 2180 /// 2: Bits [47:32] are copied to the destination. \n 2181 /// 3: Bits [63:48] are copied to the destination. 
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Each argument and the whole expansion are parenthesized so that the casts
   apply to the complete argument expression and the result composes safely
   with surrounding operators. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))

/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.  \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Each argument and the whole expansion are parenthesized so that the casts
   apply to the complete argument expression and the result composes safely
   with surrounding operators. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))

/// Compares each of the corresponding packed 16-bit integer values of
///    the 64-bit integer vectors, and writes the greater value to the
///    corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
2227 /// \param __b 2228 /// A 64-bit integer vector containing one of the source operands. 2229 /// \returns A 64-bit integer vector containing the comparison results. 2230 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2231 _mm_max_pi16(__m64 __a, __m64 __b) 2232 { 2233 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 2234 } 2235 2236 /// Compares each of the corresponding packed 8-bit unsigned integer 2237 /// values of the 64-bit integer vectors, and writes the greater value to the 2238 /// corresponding bits in the destination. 2239 /// 2240 /// \headerfile <x86intrin.h> 2241 /// 2242 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction. 2243 /// 2244 /// \param __a 2245 /// A 64-bit integer vector containing one of the source operands. 2246 /// \param __b 2247 /// A 64-bit integer vector containing one of the source operands. 2248 /// \returns A 64-bit integer vector containing the comparison results. 2249 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2250 _mm_max_pu8(__m64 __a, __m64 __b) 2251 { 2252 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 2253 } 2254 2255 /// Compares each of the corresponding packed 16-bit integer values of 2256 /// the 64-bit integer vectors, and writes the lesser value to the 2257 /// corresponding bits in the destination. 2258 /// 2259 /// \headerfile <x86intrin.h> 2260 /// 2261 /// This intrinsic corresponds to the <c> PMINSW </c> instruction. 2262 /// 2263 /// \param __a 2264 /// A 64-bit integer vector containing one of the source operands. 2265 /// \param __b 2266 /// A 64-bit integer vector containing one of the source operands. 2267 /// \returns A 64-bit integer vector containing the comparison results. 
2268 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2269 _mm_min_pi16(__m64 __a, __m64 __b) 2270 { 2271 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 2272 } 2273 2274 /// Compares each of the corresponding packed 8-bit unsigned integer 2275 /// values of the 64-bit integer vectors, and writes the lesser value to the 2276 /// corresponding bits in the destination. 2277 /// 2278 /// \headerfile <x86intrin.h> 2279 /// 2280 /// This intrinsic corresponds to the <c> PMINUB </c> instruction. 2281 /// 2282 /// \param __a 2283 /// A 64-bit integer vector containing one of the source operands. 2284 /// \param __b 2285 /// A 64-bit integer vector containing one of the source operands. 2286 /// \returns A 64-bit integer vector containing the comparison results. 2287 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2288 _mm_min_pu8(__m64 __a, __m64 __b) 2289 { 2290 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 2291 } 2292 2293 /// Takes the most significant bit from each 8-bit element in a 64-bit 2294 /// integer vector to create an 8-bit mask value. Zero-extends the value to 2295 /// 32-bit integer and writes it to the destination. 2296 /// 2297 /// \headerfile <x86intrin.h> 2298 /// 2299 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction. 2300 /// 2301 /// \param __a 2302 /// A 64-bit integer vector containing the values with bits to be extracted. 2303 /// \returns The most significant bit from each 8-bit element in \a __a, 2304 /// written to bits [7:0]. 2305 static __inline__ int __DEFAULT_FN_ATTRS_MMX 2306 _mm_movemask_pi8(__m64 __a) 2307 { 2308 return __builtin_ia32_pmovmskb((__v8qi)__a); 2309 } 2310 2311 /// Multiplies packed 16-bit unsigned integer values and writes the 2312 /// high-order 16 bits of each 32-bit product to the corresponding bits in 2313 /// the destination. 2314 /// 2315 /// \headerfile <x86intrin.h> 2316 /// 2317 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction. 
2318 /// 2319 /// \param __a 2320 /// A 64-bit integer vector containing one of the source operands. 2321 /// \param __b 2322 /// A 64-bit integer vector containing one of the source operands. 2323 /// \returns A 64-bit integer vector containing the products of both operands. 2324 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2325 _mm_mulhi_pu16(__m64 __a, __m64 __b) 2326 { 2327 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 2328 } 2329 2330 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the 2331 /// destination, as specified by the immediate value operand. 2332 /// 2333 /// \headerfile <x86intrin.h> 2334 /// 2335 /// \code 2336 /// __m64 _mm_shuffle_pi16(__m64 a, const int n); 2337 /// \endcode 2338 /// 2339 /// This intrinsic corresponds to the <c> PSHUFW </c> instruction. 2340 /// 2341 /// \param a 2342 /// A 64-bit integer vector containing the values to be shuffled. 2343 /// \param n 2344 /// An immediate value containing an 8-bit value specifying which elements to 2345 /// copy from \a a. The destinations within the 64-bit destination are 2346 /// assigned values as follows: \n 2347 /// Bits [1:0] are used to assign values to bits [15:0] in the 2348 /// destination. \n 2349 /// Bits [3:2] are used to assign values to bits [31:16] in the 2350 /// destination. \n 2351 /// Bits [5:4] are used to assign values to bits [47:32] in the 2352 /// destination. \n 2353 /// Bits [7:6] are used to assign values to bits [63:48] in the 2354 /// destination. \n 2355 /// Bit value assignments: \n 2356 /// 00: assigned from bits [15:0] of \a a. \n 2357 /// 01: assigned from bits [31:16] of \a a. \n 2358 /// 10: assigned from bits [47:32] of \a a. \n 2359 /// 11: assigned from bits [63:48] of \a a. 2360 /// \returns A 64-bit integer vector containing the shuffled values. 
2361 #define _mm_shuffle_pi16(a, n) \ 2362 (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)) 2363 2364 /// Conditionally copies the values from each 8-bit element in the first 2365 /// 64-bit integer vector operand to the specified memory location, as 2366 /// specified by the most significant bit in the corresponding element in the 2367 /// second 64-bit integer vector operand. 2368 /// 2369 /// To minimize caching, the data is flagged as non-temporal 2370 /// (unlikely to be used again soon). 2371 /// 2372 /// \headerfile <x86intrin.h> 2373 /// 2374 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction. 2375 /// 2376 /// \param __d 2377 /// A 64-bit integer vector containing the values with elements to be copied. 2378 /// \param __n 2379 /// A 64-bit integer vector operand. The most significant bit from each 8-bit 2380 /// element determines whether the corresponding element in operand \a __d 2381 /// is copied. If the most significant bit of a given element is 1, the 2382 /// corresponding element in operand \a __d is copied. 2383 /// \param __p 2384 /// A pointer to a 64-bit memory location that will receive the conditionally 2385 /// copied integer values. The address of the memory location does not have 2386 /// to be aligned. 2387 static __inline__ void __DEFAULT_FN_ATTRS_MMX 2388 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 2389 { 2390 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 2391 } 2392 2393 /// Computes the rounded averages of the packed unsigned 8-bit integer 2394 /// values and writes the averages to the corresponding bits in the 2395 /// destination. 2396 /// 2397 /// \headerfile <x86intrin.h> 2398 /// 2399 /// This intrinsic corresponds to the <c> PAVGB </c> instruction. 2400 /// 2401 /// \param __a 2402 /// A 64-bit integer vector containing one of the source operands. 2403 /// \param __b 2404 /// A 64-bit integer vector containing one of the source operands. 
2405 /// \returns A 64-bit integer vector containing the averages of both operands. 2406 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2407 _mm_avg_pu8(__m64 __a, __m64 __b) 2408 { 2409 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 2410 } 2411 2412 /// Computes the rounded averages of the packed unsigned 16-bit integer 2413 /// values and writes the averages to the corresponding bits in the 2414 /// destination. 2415 /// 2416 /// \headerfile <x86intrin.h> 2417 /// 2418 /// This intrinsic corresponds to the <c> PAVGW </c> instruction. 2419 /// 2420 /// \param __a 2421 /// A 64-bit integer vector containing one of the source operands. 2422 /// \param __b 2423 /// A 64-bit integer vector containing one of the source operands. 2424 /// \returns A 64-bit integer vector containing the averages of both operands. 2425 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2426 _mm_avg_pu16(__m64 __a, __m64 __b) 2427 { 2428 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 2429 } 2430 2431 /// Subtracts the corresponding 8-bit unsigned integer values of the two 2432 /// 64-bit vector operands and computes the absolute value for each of the 2433 /// difference. Then sum of the 8 absolute differences is written to the 2434 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared. 2435 /// 2436 /// \headerfile <x86intrin.h> 2437 /// 2438 /// This intrinsic corresponds to the <c> PSADBW </c> instruction. 2439 /// 2440 /// \param __a 2441 /// A 64-bit integer vector containing one of the source operands. 2442 /// \param __b 2443 /// A 64-bit integer vector containing one of the source operands. 2444 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the 2445 /// sets of absolute differences between both operands. The upper bits are 2446 /// cleared. 
2447 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2448 _mm_sad_pu8(__m64 __a, __m64 __b) 2449 { 2450 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 2451 } 2452 2453 #if defined(__cplusplus) 2454 extern "C" { 2455 #endif 2456 2457 /// Returns the contents of the MXCSR register as a 32-bit unsigned 2458 /// integer value. 2459 /// 2460 /// There are several groups of macros associated with this 2461 /// intrinsic, including: 2462 /// <ul> 2463 /// <li> 2464 /// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, 2465 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, 2466 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper 2467 /// _MM_GET_EXCEPTION_STATE(). 2468 /// </li> 2469 /// <li> 2470 /// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, 2471 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. 2472 /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK(). 2473 /// </li> 2474 /// <li> 2475 /// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, 2476 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper 2477 /// _MM_GET_ROUNDING_MODE(). 2478 /// </li> 2479 /// <li> 2480 /// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 2481 /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE(). 2482 /// </li> 2483 /// <li> 2484 /// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, 2485 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper 2486 /// _MM_GET_DENORMALS_ZERO_MODE(). 
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred (it is non-zero when the overflow flag is set):
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up, provided the rounding-mode bits of MXCSR are
///    currently clear (_MM_ROUND_NEAREST):
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///    OR-ing a mode into rounding-mode bits that are already non-zero selects
///    a different mode; _MM_SET_ROUNDING_MODE(x) clears those bits before
///    setting the new mode.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif

/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///      destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///      destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///      destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///      destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* The expansion is fully parenthesized so the (__m128) cast cannot be
 * re-associated by a postfix operator (e.g. a subscript) written after the
 * macro invocation. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))

/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] are written to bits [31:0] of the destination. \n
///    Bits [127:96] are written to bits [95:64] of the destination.
/// \param __b
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] are written to bits [63:32] of the destination. \n
///    Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2623 static __inline__ __m128 __DEFAULT_FN_ATTRS 2624 _mm_unpackhi_ps(__m128 __a, __m128 __b) 2625 { 2626 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); 2627 } 2628 2629 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 2630 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. 2631 /// 2632 /// \headerfile <x86intrin.h> 2633 /// 2634 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction. 2635 /// 2636 /// \param __a 2637 /// A 128-bit vector of [4 x float]. \n 2638 /// Bits [31:0] are written to bits [31:0] of the destination. \n 2639 /// Bits [63:32] are written to bits [95:64] of the destination. 2640 /// \param __b 2641 /// A 128-bit vector of [4 x float]. \n 2642 /// Bits [31:0] are written to bits [63:32] of the destination. \n 2643 /// Bits [63:32] are written to bits [127:96] of the destination. 2644 /// \returns A 128-bit vector of [4 x float] containing the interleaved values. 2645 static __inline__ __m128 __DEFAULT_FN_ATTRS 2646 _mm_unpacklo_ps(__m128 __a, __m128 __b) 2647 { 2648 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); 2649 } 2650 2651 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2652 /// 32 bits are set to the lower 32 bits of the second parameter. The upper 2653 /// 96 bits are set to the upper 96 bits of the first parameter. 2654 /// 2655 /// \headerfile <x86intrin.h> 2656 /// 2657 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c> 2658 /// instruction. 2659 /// 2660 /// \param __a 2661 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are 2662 /// written to the upper 96 bits of the result. 2663 /// \param __b 2664 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are 2665 /// written to the lower 32 bits of the result. 2666 /// \returns A 128-bit floating-point vector of [4 x float]. 
2667 static __inline__ __m128 __DEFAULT_FN_ATTRS 2668 _mm_move_ss(__m128 __a, __m128 __b) 2669 { 2670 __a[0] = __b[0]; 2671 return __a; 2672 } 2673 2674 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2675 /// 64 bits are set to the upper 64 bits of the second parameter. The upper 2676 /// 64 bits are set to the upper 64 bits of the first parameter. 2677 /// 2678 /// \headerfile <x86intrin.h> 2679 /// 2680 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 2681 /// 2682 /// \param __a 2683 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2684 /// written to the upper 64 bits of the result. 2685 /// \param __b 2686 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are 2687 /// written to the lower 64 bits of the result. 2688 /// \returns A 128-bit floating-point vector of [4 x float]. 2689 static __inline__ __m128 __DEFAULT_FN_ATTRS 2690 _mm_movehl_ps(__m128 __a, __m128 __b) 2691 { 2692 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); 2693 } 2694 2695 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower 2696 /// 64 bits are set to the lower 64 bits of the first parameter. The upper 2697 /// 64 bits are set to the lower 64 bits of the second parameter. 2698 /// 2699 /// \headerfile <x86intrin.h> 2700 /// 2701 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 2702 /// 2703 /// \param __a 2704 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2705 /// written to the lower 64 bits of the result. 2706 /// \param __b 2707 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are 2708 /// written to the upper 64 bits of the result. 2709 /// \returns A 128-bit floating-point vector of [4 x float]. 
2710 static __inline__ __m128 __DEFAULT_FN_ATTRS 2711 _mm_movelh_ps(__m128 __a, __m128 __b) 2712 { 2713 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); 2714 } 2715 2716 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x 2717 /// float]. 2718 /// 2719 /// \headerfile <x86intrin.h> 2720 /// 2721 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2722 /// 2723 /// \param __a 2724 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied 2725 /// from the corresponding elements in this operand. 2726 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2727 /// values from the operand. 2728 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2729 _mm_cvtpi16_ps(__m64 __a) 2730 { 2731 __m64 __b, __c; 2732 __m128 __r; 2733 2734 __b = _mm_setzero_si64(); 2735 __b = _mm_cmpgt_pi16(__b, __a); 2736 __c = _mm_unpackhi_pi16(__a, __b); 2737 __r = _mm_setzero_ps(); 2738 __r = _mm_cvtpi32_ps(__r, __c); 2739 __r = _mm_movelh_ps(__r, __r); 2740 __c = _mm_unpacklo_pi16(__a, __b); 2741 __r = _mm_cvtpi32_ps(__r, __c); 2742 2743 return __r; 2744 } 2745 2746 /// Converts a 64-bit vector of 16-bit unsigned integer values into a 2747 /// 128-bit vector of [4 x float]. 2748 /// 2749 /// \headerfile <x86intrin.h> 2750 /// 2751 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2752 /// 2753 /// \param __a 2754 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the 2755 /// destination are copied from the corresponding elements in this operand. 2756 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2757 /// values from the operand. 
2758 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2759 _mm_cvtpu16_ps(__m64 __a) 2760 { 2761 __m64 __b, __c; 2762 __m128 __r; 2763 2764 __b = _mm_setzero_si64(); 2765 __c = _mm_unpackhi_pi16(__a, __b); 2766 __r = _mm_setzero_ps(); 2767 __r = _mm_cvtpi32_ps(__r, __c); 2768 __r = _mm_movelh_ps(__r, __r); 2769 __c = _mm_unpacklo_pi16(__a, __b); 2770 __r = _mm_cvtpi32_ps(__r, __c); 2771 2772 return __r; 2773 } 2774 2775 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] 2776 /// into a 128-bit vector of [4 x float]. 2777 /// 2778 /// \headerfile <x86intrin.h> 2779 /// 2780 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2781 /// 2782 /// \param __a 2783 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied 2784 /// from the corresponding lower 4 elements in this operand. 2785 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2786 /// values from the operand. 2787 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2788 _mm_cvtpi8_ps(__m64 __a) 2789 { 2790 __m64 __b; 2791 2792 __b = _mm_setzero_si64(); 2793 __b = _mm_cmpgt_pi8(__b, __a); 2794 __b = _mm_unpacklo_pi8(__a, __b); 2795 2796 return _mm_cvtpi16_ps(__b); 2797 } 2798 2799 /// Converts the lower four unsigned 8-bit integer values from a 64-bit 2800 /// vector of [8 x u8] into a 128-bit vector of [4 x float]. 2801 /// 2802 /// \headerfile <x86intrin.h> 2803 /// 2804 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2805 /// 2806 /// \param __a 2807 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the 2808 /// destination are copied from the corresponding lower 4 elements in this 2809 /// operand. 2810 /// \returns A 128-bit vector of [4 x float] containing the copied and converted 2811 /// values from the source operand. 
2812 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2813 _mm_cvtpu8_ps(__m64 __a) 2814 { 2815 __m64 __b; 2816 2817 __b = _mm_setzero_si64(); 2818 __b = _mm_unpacklo_pi8(__a, __b); 2819 2820 return _mm_cvtpi16_ps(__b); 2821 } 2822 2823 /// Converts the two 32-bit signed integer values from each 64-bit vector 2824 /// operand of [2 x i32] into a 128-bit vector of [4 x float]. 2825 /// 2826 /// \headerfile <x86intrin.h> 2827 /// 2828 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. 2829 /// 2830 /// \param __a 2831 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are 2832 /// copied from the elements in this operand. 2833 /// \param __b 2834 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are 2835 /// copied from the elements in this operand. 2836 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 2837 /// copied and converted values from the first operand. The upper 64 bits 2838 /// contain the copied and converted values from the second operand. 2839 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX 2840 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 2841 { 2842 __m128 __c; 2843 2844 __c = _mm_setzero_ps(); 2845 __c = _mm_cvtpi32_ps(__c, __b); 2846 __c = _mm_movelh_ps(__c, __c); 2847 2848 return _mm_cvtpi32_ps(__c, __a); 2849 } 2850 2851 /// Converts each single-precision floating-point element of a 128-bit 2852 /// floating-point vector of [4 x float] into a 16-bit signed integer, and 2853 /// packs the results into a 64-bit integer vector of [4 x i16]. 2854 /// 2855 /// If the floating-point element is NaN or infinity, or if the 2856 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000, 2857 /// it is converted to 0x8000. Otherwise if the floating-point element is 2858 /// greater than 0x7FFF, it is converted to 0x7FFF. 
2859 /// 2860 /// \headerfile <x86intrin.h> 2861 /// 2862 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2863 /// 2864 /// \param __a 2865 /// A 128-bit floating-point vector of [4 x float]. 2866 /// \returns A 64-bit integer vector of [4 x i16] containing the converted 2867 /// values. 2868 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2869 _mm_cvtps_pi16(__m128 __a) 2870 { 2871 __m64 __b, __c; 2872 2873 __b = _mm_cvtps_pi32(__a); 2874 __a = _mm_movehl_ps(__a, __a); 2875 __c = _mm_cvtps_pi32(__a); 2876 2877 return _mm_packs_pi32(__b, __c); 2878 } 2879 2880 /// Converts each single-precision floating-point element of a 128-bit 2881 /// floating-point vector of [4 x float] into an 8-bit signed integer, and 2882 /// packs the results into the lower 32 bits of a 64-bit integer vector of 2883 /// [8 x i8]. The upper 32 bits of the vector are set to 0. 2884 /// 2885 /// If the floating-point element is NaN or infinity, or if the 2886 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it 2887 /// is converted to 0x80. Otherwise if the floating-point element is greater 2888 /// than 0x7F, it is converted to 0x7F. 2889 /// 2890 /// \headerfile <x86intrin.h> 2891 /// 2892 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. 2893 /// 2894 /// \param __a 2895 /// 128-bit floating-point vector of [4 x float]. 2896 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the 2897 /// converted values and the uppper 32 bits are set to zero. 2898 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2899 _mm_cvtps_pi8(__m128 __a) 2900 { 2901 __m64 __b, __c; 2902 2903 __b = _mm_cvtps_pi16(__a); 2904 __c = _mm_setzero_si64(); 2905 2906 return _mm_packs_pi16(__b, __c); 2907 } 2908 2909 /// Extracts the sign bits from each single-precision floating-point 2910 /// element of a 128-bit floating-point vector of [4 x float] and returns the 2911 /// sign bits in bits [0:3] of the result. 
Bits [31:4] of the result are set 2912 /// to zero. 2913 /// 2914 /// \headerfile <x86intrin.h> 2915 /// 2916 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction. 2917 /// 2918 /// \param __a 2919 /// A 128-bit floating-point vector of [4 x float]. 2920 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each 2921 /// single-precision floating-point element of the parameter. Bits [31:4] are 2922 /// set to zero. 2923 static __inline__ int __DEFAULT_FN_ATTRS 2924 _mm_movemask_ps(__m128 __a) 2925 { 2926 return __builtin_ia32_movmskps((__v4sf)__a); 2927 } 2928 2929 2930 #define _MM_ALIGN16 __attribute__((aligned(16))) 2931 2932 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 2933 2934 #define _MM_EXCEPT_INVALID (0x0001) 2935 #define _MM_EXCEPT_DENORM (0x0002) 2936 #define _MM_EXCEPT_DIV_ZERO (0x0004) 2937 #define _MM_EXCEPT_OVERFLOW (0x0008) 2938 #define _MM_EXCEPT_UNDERFLOW (0x0010) 2939 #define _MM_EXCEPT_INEXACT (0x0020) 2940 #define _MM_EXCEPT_MASK (0x003f) 2941 2942 #define _MM_MASK_INVALID (0x0080) 2943 #define _MM_MASK_DENORM (0x0100) 2944 #define _MM_MASK_DIV_ZERO (0x0200) 2945 #define _MM_MASK_OVERFLOW (0x0400) 2946 #define _MM_MASK_UNDERFLOW (0x0800) 2947 #define _MM_MASK_INEXACT (0x1000) 2948 #define _MM_MASK_MASK (0x1f80) 2949 2950 #define _MM_ROUND_NEAREST (0x0000) 2951 #define _MM_ROUND_DOWN (0x2000) 2952 #define _MM_ROUND_UP (0x4000) 2953 #define _MM_ROUND_TOWARD_ZERO (0x6000) 2954 #define _MM_ROUND_MASK (0x6000) 2955 2956 #define _MM_FLUSH_ZERO_MASK (0x8000) 2957 #define _MM_FLUSH_ZERO_ON (0x8000) 2958 #define _MM_FLUSH_ZERO_OFF (0x0000) 2959 2960 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 2961 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 2962 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 2963 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 2964 2965 #define _MM_SET_EXCEPTION_MASK(x) 
(_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 2966 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 2967 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 2968 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 2969 2970 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2971 do { \ 2972 __m128 tmp3, tmp2, tmp1, tmp0; \ 2973 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 2974 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 2975 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 2976 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 2977 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 2978 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 2979 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 2980 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 2981 } while (0) 2982 2983 /* Aliases for compatibility. */ 2984 #define _m_pextrw _mm_extract_pi16 2985 #define _m_pinsrw _mm_insert_pi16 2986 #define _m_pmaxsw _mm_max_pi16 2987 #define _m_pmaxub _mm_max_pu8 2988 #define _m_pminsw _mm_min_pi16 2989 #define _m_pminub _mm_min_pu8 2990 #define _m_pmovmskb _mm_movemask_pi8 2991 #define _m_pmulhuw _mm_mulhi_pu16 2992 #define _m_pshufw _mm_shuffle_pi16 2993 #define _m_maskmovq _mm_maskmove_si64 2994 #define _m_pavgb _mm_avg_pu8 2995 #define _m_pavgw _mm_avg_pu16 2996 #define _m_psadbw _mm_sad_pu8 2997 #define _m_ _mm_ 2998 #define _m_ _mm_ 2999 3000 #undef __DEFAULT_FN_ATTRS 3001 #undef __DEFAULT_FN_ATTRS_MMX 3002 3003 /* Ugly hack for backwards-compatibility (compatible with gcc) */ 3004 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) 3005 #include <emmintrin.h> 3006 #endif 3007 3008 #endif /* __XMMINTRIN_H */ 3009