/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* This header is only meant to be reached through <immintrin.h>; direct
 * inclusion is rejected at preprocessing time. */
#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXINTRIN_H
#define __AVXINTRIN_H

/* 32-byte (256-bit) vector types, one per signed/floating element type. These
 * are the types the function bodies below cast to before operating. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 256-bit vector types used in the intrinsic signatures; each carries a
 * 32-byte alignment requirement. */
typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Same element layout as above, but with the alignment requirement lowered
 * to 1 byte. */
typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));

typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
#endif

/* Define the default attributes for the functions in this file.
 * __DEFAULT_FN_ATTRS is applied to the functions operating on 256-bit vectors
 * and __DEFAULT_FN_ATTRS128 to those operating on 128-bit vectors; the two
 * differ only in the __min_vector_width__ value. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(128)))

/* Arithmetic */
/// Adds two 256-bit vectors of [4 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x double] containing one of the source operands.
/// \returns A 256-bit vector of [4 x double] containing the sums of both
///    operands.
73 static __inline __m256d __DEFAULT_FN_ATTRS 74 _mm256_add_pd(__m256d __a, __m256d __b) 75 { 76 return (__m256d)((__v4df)__a+(__v4df)__b); 77 } 78 79 /// Adds two 256-bit vectors of [8 x float]. 80 /// 81 /// \headerfile <x86intrin.h> 82 /// 83 /// This intrinsic corresponds to the <c> VADDPS </c> instruction. 84 /// 85 /// \param __a 86 /// A 256-bit vector of [8 x float] containing one of the source operands. 87 /// \param __b 88 /// A 256-bit vector of [8 x float] containing one of the source operands. 89 /// \returns A 256-bit vector of [8 x float] containing the sums of both 90 /// operands. 91 static __inline __m256 __DEFAULT_FN_ATTRS 92 _mm256_add_ps(__m256 __a, __m256 __b) 93 { 94 return (__m256)((__v8sf)__a+(__v8sf)__b); 95 } 96 97 /// Subtracts two 256-bit vectors of [4 x double]. 98 /// 99 /// \headerfile <x86intrin.h> 100 /// 101 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 102 /// 103 /// \param __a 104 /// A 256-bit vector of [4 x double] containing the minuend. 105 /// \param __b 106 /// A 256-bit vector of [4 x double] containing the subtrahend. 107 /// \returns A 256-bit vector of [4 x double] containing the differences between 108 /// both operands. 109 static __inline __m256d __DEFAULT_FN_ATTRS 110 _mm256_sub_pd(__m256d __a, __m256d __b) 111 { 112 return (__m256d)((__v4df)__a-(__v4df)__b); 113 } 114 115 /// Subtracts two 256-bit vectors of [8 x float]. 116 /// 117 /// \headerfile <x86intrin.h> 118 /// 119 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 120 /// 121 /// \param __a 122 /// A 256-bit vector of [8 x float] containing the minuend. 123 /// \param __b 124 /// A 256-bit vector of [8 x float] containing the subtrahend. 125 /// \returns A 256-bit vector of [8 x float] containing the differences between 126 /// both operands. 
127 static __inline __m256 __DEFAULT_FN_ATTRS 128 _mm256_sub_ps(__m256 __a, __m256 __b) 129 { 130 return (__m256)((__v8sf)__a-(__v8sf)__b); 131 } 132 133 /// Adds the even-indexed values and subtracts the odd-indexed values of 134 /// two 256-bit vectors of [4 x double]. 135 /// 136 /// \headerfile <x86intrin.h> 137 /// 138 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 139 /// 140 /// \param __a 141 /// A 256-bit vector of [4 x double] containing the left source operand. 142 /// \param __b 143 /// A 256-bit vector of [4 x double] containing the right source operand. 144 /// \returns A 256-bit vector of [4 x double] containing the alternating sums 145 /// and differences between both operands. 146 static __inline __m256d __DEFAULT_FN_ATTRS 147 _mm256_addsub_pd(__m256d __a, __m256d __b) 148 { 149 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 150 } 151 152 /// Adds the even-indexed values and subtracts the odd-indexed values of 153 /// two 256-bit vectors of [8 x float]. 154 /// 155 /// \headerfile <x86intrin.h> 156 /// 157 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 158 /// 159 /// \param __a 160 /// A 256-bit vector of [8 x float] containing the left source operand. 161 /// \param __b 162 /// A 256-bit vector of [8 x float] containing the right source operand. 163 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and 164 /// differences between both operands. 165 static __inline __m256 __DEFAULT_FN_ATTRS 166 _mm256_addsub_ps(__m256 __a, __m256 __b) 167 { 168 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 169 } 170 171 /// Divides two 256-bit vectors of [4 x double]. 172 /// 173 /// \headerfile <x86intrin.h> 174 /// 175 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 176 /// 177 /// \param __a 178 /// A 256-bit vector of [4 x double] containing the dividend. 
179 /// \param __b 180 /// A 256-bit vector of [4 x double] containing the divisor. 181 /// \returns A 256-bit vector of [4 x double] containing the quotients of both 182 /// operands. 183 static __inline __m256d __DEFAULT_FN_ATTRS 184 _mm256_div_pd(__m256d __a, __m256d __b) 185 { 186 return (__m256d)((__v4df)__a/(__v4df)__b); 187 } 188 189 /// Divides two 256-bit vectors of [8 x float]. 190 /// 191 /// \headerfile <x86intrin.h> 192 /// 193 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 194 /// 195 /// \param __a 196 /// A 256-bit vector of [8 x float] containing the dividend. 197 /// \param __b 198 /// A 256-bit vector of [8 x float] containing the divisor. 199 /// \returns A 256-bit vector of [8 x float] containing the quotients of both 200 /// operands. 201 static __inline __m256 __DEFAULT_FN_ATTRS 202 _mm256_div_ps(__m256 __a, __m256 __b) 203 { 204 return (__m256)((__v8sf)__a/(__v8sf)__b); 205 } 206 207 /// Compares two 256-bit vectors of [4 x double] and returns the greater 208 /// of each pair of values. 209 /// 210 /// If either value in a comparison is NaN, returns the value from \a __b. 211 /// 212 /// \headerfile <x86intrin.h> 213 /// 214 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 215 /// 216 /// \param __a 217 /// A 256-bit vector of [4 x double] containing one of the operands. 218 /// \param __b 219 /// A 256-bit vector of [4 x double] containing one of the operands. 220 /// \returns A 256-bit vector of [4 x double] containing the maximum values 221 /// between both operands. 222 static __inline __m256d __DEFAULT_FN_ATTRS 223 _mm256_max_pd(__m256d __a, __m256d __b) 224 { 225 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 226 } 227 228 /// Compares two 256-bit vectors of [8 x float] and returns the greater 229 /// of each pair of values. 230 /// 231 /// If either value in a comparison is NaN, returns the value from \a __b. 
232 /// 233 /// \headerfile <x86intrin.h> 234 /// 235 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 236 /// 237 /// \param __a 238 /// A 256-bit vector of [8 x float] containing one of the operands. 239 /// \param __b 240 /// A 256-bit vector of [8 x float] containing one of the operands. 241 /// \returns A 256-bit vector of [8 x float] containing the maximum values 242 /// between both operands. 243 static __inline __m256 __DEFAULT_FN_ATTRS 244 _mm256_max_ps(__m256 __a, __m256 __b) 245 { 246 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 247 } 248 249 /// Compares two 256-bit vectors of [4 x double] and returns the lesser 250 /// of each pair of values. 251 /// 252 /// If either value in a comparison is NaN, returns the value from \a __b. 253 /// 254 /// \headerfile <x86intrin.h> 255 /// 256 /// This intrinsic corresponds to the <c> VMINPD </c> instruction. 257 /// 258 /// \param __a 259 /// A 256-bit vector of [4 x double] containing one of the operands. 260 /// \param __b 261 /// A 256-bit vector of [4 x double] containing one of the operands. 262 /// \returns A 256-bit vector of [4 x double] containing the minimum values 263 /// between both operands. 264 static __inline __m256d __DEFAULT_FN_ATTRS 265 _mm256_min_pd(__m256d __a, __m256d __b) 266 { 267 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 268 } 269 270 /// Compares two 256-bit vectors of [8 x float] and returns the lesser 271 /// of each pair of values. 272 /// 273 /// If either value in a comparison is NaN, returns the value from \a __b. 274 /// 275 /// \headerfile <x86intrin.h> 276 /// 277 /// This intrinsic corresponds to the <c> VMINPS </c> instruction. 278 /// 279 /// \param __a 280 /// A 256-bit vector of [8 x float] containing one of the operands. 281 /// \param __b 282 /// A 256-bit vector of [8 x float] containing one of the operands. 
283 /// \returns A 256-bit vector of [8 x float] containing the minimum values 284 /// between both operands. 285 static __inline __m256 __DEFAULT_FN_ATTRS 286 _mm256_min_ps(__m256 __a, __m256 __b) 287 { 288 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 289 } 290 291 /// Multiplies two 256-bit vectors of [4 x double]. 292 /// 293 /// \headerfile <x86intrin.h> 294 /// 295 /// This intrinsic corresponds to the <c> VMULPD </c> instruction. 296 /// 297 /// \param __a 298 /// A 256-bit vector of [4 x double] containing one of the operands. 299 /// \param __b 300 /// A 256-bit vector of [4 x double] containing one of the operands. 301 /// \returns A 256-bit vector of [4 x double] containing the products of both 302 /// operands. 303 static __inline __m256d __DEFAULT_FN_ATTRS 304 _mm256_mul_pd(__m256d __a, __m256d __b) 305 { 306 return (__m256d)((__v4df)__a * (__v4df)__b); 307 } 308 309 /// Multiplies two 256-bit vectors of [8 x float]. 310 /// 311 /// \headerfile <x86intrin.h> 312 /// 313 /// This intrinsic corresponds to the <c> VMULPS </c> instruction. 314 /// 315 /// \param __a 316 /// A 256-bit vector of [8 x float] containing one of the operands. 317 /// \param __b 318 /// A 256-bit vector of [8 x float] containing one of the operands. 319 /// \returns A 256-bit vector of [8 x float] containing the products of both 320 /// operands. 321 static __inline __m256 __DEFAULT_FN_ATTRS 322 _mm256_mul_ps(__m256 __a, __m256 __b) 323 { 324 return (__m256)((__v8sf)__a * (__v8sf)__b); 325 } 326 327 /// Calculates the square roots of the values in a 256-bit vector of 328 /// [4 x double]. 329 /// 330 /// \headerfile <x86intrin.h> 331 /// 332 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 333 /// 334 /// \param __a 335 /// A 256-bit vector of [4 x double]. 336 /// \returns A 256-bit vector of [4 x double] containing the square roots of the 337 /// values in the operand. 
338 static __inline __m256d __DEFAULT_FN_ATTRS 339 _mm256_sqrt_pd(__m256d __a) 340 { 341 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 342 } 343 344 /// Calculates the square roots of the values in a 256-bit vector of 345 /// [8 x float]. 346 /// 347 /// \headerfile <x86intrin.h> 348 /// 349 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 350 /// 351 /// \param __a 352 /// A 256-bit vector of [8 x float]. 353 /// \returns A 256-bit vector of [8 x float] containing the square roots of the 354 /// values in the operand. 355 static __inline __m256 __DEFAULT_FN_ATTRS 356 _mm256_sqrt_ps(__m256 __a) 357 { 358 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 359 } 360 361 /// Calculates the reciprocal square roots of the values in a 256-bit 362 /// vector of [8 x float]. 363 /// 364 /// \headerfile <x86intrin.h> 365 /// 366 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 367 /// 368 /// \param __a 369 /// A 256-bit vector of [8 x float]. 370 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square 371 /// roots of the values in the operand. 372 static __inline __m256 __DEFAULT_FN_ATTRS 373 _mm256_rsqrt_ps(__m256 __a) 374 { 375 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 376 } 377 378 /// Calculates the reciprocals of the values in a 256-bit vector of 379 /// [8 x float]. 380 /// 381 /// \headerfile <x86intrin.h> 382 /// 383 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 384 /// 385 /// \param __a 386 /// A 256-bit vector of [8 x float]. 387 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 388 /// values in the operand. 389 static __inline __m256 __DEFAULT_FN_ATTRS 390 _mm256_rcp_ps(__m256 __a) 391 { 392 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 393 } 394 395 /// Rounds the values in a 256-bit vector of [4 x double] as specified 396 /// by the byte operand. 
The source values are rounded to integer values and 397 /// returned as 64-bit double-precision floating-point values. 398 /// 399 /// \headerfile <x86intrin.h> 400 /// 401 /// \code 402 /// __m256d _mm256_round_pd(__m256d V, const int M); 403 /// \endcode 404 /// 405 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 406 /// 407 /// \param V 408 /// A 256-bit vector of [4 x double]. 409 /// \param M 410 /// An integer value that specifies the rounding operation. \n 411 /// Bits [7:4] are reserved. \n 412 /// Bit [3] is a precision exception value: \n 413 /// 0: A normal PE exception is used. \n 414 /// 1: The PE field is not updated. \n 415 /// Bit [2] is the rounding control source: \n 416 /// 0: Use bits [1:0] of \a M. \n 417 /// 1: Use the current MXCSR setting. \n 418 /// Bits [1:0] contain the rounding control definition: \n 419 /// 00: Nearest. \n 420 /// 01: Downward (toward negative infinity). \n 421 /// 10: Upward (toward positive infinity). \n 422 /// 11: Truncated. 423 /// \returns A 256-bit vector of [4 x double] containing the rounded values. 424 #define _mm256_round_pd(V, M) \ 425 ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))) 426 427 /// Rounds the values stored in a 256-bit vector of [8 x float] as 428 /// specified by the byte operand. The source values are rounded to integer 429 /// values and returned as floating-point values. 430 /// 431 /// \headerfile <x86intrin.h> 432 /// 433 /// \code 434 /// __m256 _mm256_round_ps(__m256 V, const int M); 435 /// \endcode 436 /// 437 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 438 /// 439 /// \param V 440 /// A 256-bit vector of [8 x float]. 441 /// \param M 442 /// An integer value that specifies the rounding operation. \n 443 /// Bits [7:4] are reserved. \n 444 /// Bit [3] is a precision exception value: \n 445 /// 0: A normal PE exception is used. \n 446 /// 1: The PE field is not updated. 
\n 447 /// Bit [2] is the rounding control source: \n 448 /// 0: Use bits [1:0] of \a M. \n 449 /// 1: Use the current MXCSR setting. \n 450 /// Bits [1:0] contain the rounding control definition: \n 451 /// 00: Nearest. \n 452 /// 01: Downward (toward negative infinity). \n 453 /// 10: Upward (toward positive infinity). \n 454 /// 11: Truncated. 455 /// \returns A 256-bit vector of [8 x float] containing the rounded values. 456 #define _mm256_round_ps(V, M) \ 457 ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))) 458 459 /// Rounds up the values stored in a 256-bit vector of [4 x double]. The 460 /// source values are rounded up to integer values and returned as 64-bit 461 /// double-precision floating-point values. 462 /// 463 /// \headerfile <x86intrin.h> 464 /// 465 /// \code 466 /// __m256d _mm256_ceil_pd(__m256d V); 467 /// \endcode 468 /// 469 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 470 /// 471 /// \param V 472 /// A 256-bit vector of [4 x double]. 473 /// \returns A 256-bit vector of [4 x double] containing the rounded up values. 474 #define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) 475 476 /// Rounds down the values stored in a 256-bit vector of [4 x double]. 477 /// The source values are rounded down to integer values and returned as 478 /// 64-bit double-precision floating-point values. 479 /// 480 /// \headerfile <x86intrin.h> 481 /// 482 /// \code 483 /// __m256d _mm256_floor_pd(__m256d V); 484 /// \endcode 485 /// 486 /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 487 /// 488 /// \param V 489 /// A 256-bit vector of [4 x double]. 490 /// \returns A 256-bit vector of [4 x double] containing the rounded down 491 /// values. 492 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) 493 494 /// Rounds up the values stored in a 256-bit vector of [8 x float]. The 495 /// source values are rounded up to integer values and returned as 496 /// floating-point values. 
497 /// 498 /// \headerfile <x86intrin.h> 499 /// 500 /// \code 501 /// __m256 _mm256_ceil_ps(__m256 V); 502 /// \endcode 503 /// 504 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 505 /// 506 /// \param V 507 /// A 256-bit vector of [8 x float]. 508 /// \returns A 256-bit vector of [8 x float] containing the rounded up values. 509 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 510 511 /// Rounds down the values stored in a 256-bit vector of [8 x float]. The 512 /// source values are rounded down to integer values and returned as 513 /// floating-point values. 514 /// 515 /// \headerfile <x86intrin.h> 516 /// 517 /// \code 518 /// __m256 _mm256_floor_ps(__m256 V); 519 /// \endcode 520 /// 521 /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 522 /// 523 /// \param V 524 /// A 256-bit vector of [8 x float]. 525 /// \returns A 256-bit vector of [8 x float] containing the rounded down values. 526 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 527 528 /* Logical */ 529 /// Performs a bitwise AND of two 256-bit vectors of [4 x double]. 530 /// 531 /// \headerfile <x86intrin.h> 532 /// 533 /// This intrinsic corresponds to the <c> VANDPD </c> instruction. 534 /// 535 /// \param __a 536 /// A 256-bit vector of [4 x double] containing one of the source operands. 537 /// \param __b 538 /// A 256-bit vector of [4 x double] containing one of the source operands. 539 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 540 /// values between both operands. 541 static __inline __m256d __DEFAULT_FN_ATTRS 542 _mm256_and_pd(__m256d __a, __m256d __b) 543 { 544 return (__m256d)((__v4du)__a & (__v4du)__b); 545 } 546 547 /// Performs a bitwise AND of two 256-bit vectors of [8 x float]. 548 /// 549 /// \headerfile <x86intrin.h> 550 /// 551 /// This intrinsic corresponds to the <c> VANDPS </c> instruction. 
552 /// 553 /// \param __a 554 /// A 256-bit vector of [8 x float] containing one of the source operands. 555 /// \param __b 556 /// A 256-bit vector of [8 x float] containing one of the source operands. 557 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 558 /// values between both operands. 559 static __inline __m256 __DEFAULT_FN_ATTRS 560 _mm256_and_ps(__m256 __a, __m256 __b) 561 { 562 return (__m256)((__v8su)__a & (__v8su)__b); 563 } 564 565 /// Performs a bitwise AND of two 256-bit vectors of [4 x double], using 566 /// the one's complement of the values contained in the first source operand. 567 /// 568 /// \headerfile <x86intrin.h> 569 /// 570 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 571 /// 572 /// \param __a 573 /// A 256-bit vector of [4 x double] containing the left source operand. The 574 /// one's complement of this value is used in the bitwise AND. 575 /// \param __b 576 /// A 256-bit vector of [4 x double] containing the right source operand. 577 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 578 /// values of the second operand and the one's complement of the first 579 /// operand. 580 static __inline __m256d __DEFAULT_FN_ATTRS 581 _mm256_andnot_pd(__m256d __a, __m256d __b) 582 { 583 return (__m256d)(~(__v4du)__a & (__v4du)__b); 584 } 585 586 /// Performs a bitwise AND of two 256-bit vectors of [8 x float], using 587 /// the one's complement of the values contained in the first source operand. 588 /// 589 /// \headerfile <x86intrin.h> 590 /// 591 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 592 /// 593 /// \param __a 594 /// A 256-bit vector of [8 x float] containing the left source operand. The 595 /// one's complement of this value is used in the bitwise AND. 596 /// \param __b 597 /// A 256-bit vector of [8 x float] containing the right source operand. 
598 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 599 /// values of the second operand and the one's complement of the first 600 /// operand. 601 static __inline __m256 __DEFAULT_FN_ATTRS 602 _mm256_andnot_ps(__m256 __a, __m256 __b) 603 { 604 return (__m256)(~(__v8su)__a & (__v8su)__b); 605 } 606 607 /// Performs a bitwise OR of two 256-bit vectors of [4 x double]. 608 /// 609 /// \headerfile <x86intrin.h> 610 /// 611 /// This intrinsic corresponds to the <c> VORPD </c> instruction. 612 /// 613 /// \param __a 614 /// A 256-bit vector of [4 x double] containing one of the source operands. 615 /// \param __b 616 /// A 256-bit vector of [4 x double] containing one of the source operands. 617 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 618 /// values between both operands. 619 static __inline __m256d __DEFAULT_FN_ATTRS 620 _mm256_or_pd(__m256d __a, __m256d __b) 621 { 622 return (__m256d)((__v4du)__a | (__v4du)__b); 623 } 624 625 /// Performs a bitwise OR of two 256-bit vectors of [8 x float]. 626 /// 627 /// \headerfile <x86intrin.h> 628 /// 629 /// This intrinsic corresponds to the <c> VORPS </c> instruction. 630 /// 631 /// \param __a 632 /// A 256-bit vector of [8 x float] containing one of the source operands. 633 /// \param __b 634 /// A 256-bit vector of [8 x float] containing one of the source operands. 635 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 636 /// values between both operands. 637 static __inline __m256 __DEFAULT_FN_ATTRS 638 _mm256_or_ps(__m256 __a, __m256 __b) 639 { 640 return (__m256)((__v8su)__a | (__v8su)__b); 641 } 642 643 /// Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 644 /// 645 /// \headerfile <x86intrin.h> 646 /// 647 /// This intrinsic corresponds to the <c> VXORPD </c> instruction. 648 /// 649 /// \param __a 650 /// A 256-bit vector of [4 x double] containing one of the source operands. 
651 /// \param __b 652 /// A 256-bit vector of [4 x double] containing one of the source operands. 653 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 654 /// values between both operands. 655 static __inline __m256d __DEFAULT_FN_ATTRS 656 _mm256_xor_pd(__m256d __a, __m256d __b) 657 { 658 return (__m256d)((__v4du)__a ^ (__v4du)__b); 659 } 660 661 /// Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 662 /// 663 /// \headerfile <x86intrin.h> 664 /// 665 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 666 /// 667 /// \param __a 668 /// A 256-bit vector of [8 x float] containing one of the source operands. 669 /// \param __b 670 /// A 256-bit vector of [8 x float] containing one of the source operands. 671 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 672 /// values between both operands. 673 static __inline __m256 __DEFAULT_FN_ATTRS 674 _mm256_xor_ps(__m256 __a, __m256 __b) 675 { 676 return (__m256)((__v8su)__a ^ (__v8su)__b); 677 } 678 679 /* Horizontal arithmetic */ 680 /// Horizontally adds the adjacent pairs of values contained in two 681 /// 256-bit vectors of [4 x double]. 682 /// 683 /// \headerfile <x86intrin.h> 684 /// 685 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 686 /// 687 /// \param __a 688 /// A 256-bit vector of [4 x double] containing one of the source operands. 689 /// The horizontal sums of the values are returned in the even-indexed 690 /// elements of a vector of [4 x double]. 691 /// \param __b 692 /// A 256-bit vector of [4 x double] containing one of the source operands. 693 /// The horizontal sums of the values are returned in the odd-indexed 694 /// elements of a vector of [4 x double]. 695 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 696 /// both operands. 
697 static __inline __m256d __DEFAULT_FN_ATTRS 698 _mm256_hadd_pd(__m256d __a, __m256d __b) 699 { 700 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 701 } 702 703 /// Horizontally adds the adjacent pairs of values contained in two 704 /// 256-bit vectors of [8 x float]. 705 /// 706 /// \headerfile <x86intrin.h> 707 /// 708 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 709 /// 710 /// \param __a 711 /// A 256-bit vector of [8 x float] containing one of the source operands. 712 /// The horizontal sums of the values are returned in the elements with 713 /// index 0, 1, 4, 5 of a vector of [8 x float]. 714 /// \param __b 715 /// A 256-bit vector of [8 x float] containing one of the source operands. 716 /// The horizontal sums of the values are returned in the elements with 717 /// index 2, 3, 6, 7 of a vector of [8 x float]. 718 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 719 /// both operands. 720 static __inline __m256 __DEFAULT_FN_ATTRS 721 _mm256_hadd_ps(__m256 __a, __m256 __b) 722 { 723 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 724 } 725 726 /// Horizontally subtracts the adjacent pairs of values contained in two 727 /// 256-bit vectors of [4 x double]. 728 /// 729 /// \headerfile <x86intrin.h> 730 /// 731 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 732 /// 733 /// \param __a 734 /// A 256-bit vector of [4 x double] containing one of the source operands. 735 /// The horizontal differences between the values are returned in the 736 /// even-indexed elements of a vector of [4 x double]. 737 /// \param __b 738 /// A 256-bit vector of [4 x double] containing one of the source operands. 739 /// The horizontal differences between the values are returned in the 740 /// odd-indexed elements of a vector of [4 x double]. 741 /// \returns A 256-bit vector of [4 x double] containing the horizontal 742 /// differences of both operands. 
743 static __inline __m256d __DEFAULT_FN_ATTRS 744 _mm256_hsub_pd(__m256d __a, __m256d __b) 745 { 746 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 747 } 748 749 /// Horizontally subtracts the adjacent pairs of values contained in two 750 /// 256-bit vectors of [8 x float]. 751 /// 752 /// \headerfile <x86intrin.h> 753 /// 754 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 755 /// 756 /// \param __a 757 /// A 256-bit vector of [8 x float] containing one of the source operands. 758 /// The horizontal differences between the values are returned in the 759 /// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 760 /// \param __b 761 /// A 256-bit vector of [8 x float] containing one of the source operands. 762 /// The horizontal differences between the values are returned in the 763 /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 764 /// \returns A 256-bit vector of [8 x float] containing the horizontal 765 /// differences of both operands. 766 static __inline __m256 __DEFAULT_FN_ATTRS 767 _mm256_hsub_ps(__m256 __a, __m256 __b) 768 { 769 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 770 } 771 772 /* Vector permutations */ 773 /// Copies the values in a 128-bit vector of [2 x double] as specified 774 /// by the 128-bit integer vector operand. 775 /// 776 /// \headerfile <x86intrin.h> 777 /// 778 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 779 /// 780 /// \param __a 781 /// A 128-bit vector of [2 x double]. 782 /// \param __c 783 /// A 128-bit integer vector operand specifying how the values are to be 784 /// copied. \n 785 /// Bit [1]: \n 786 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 787 /// vector. \n 788 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 789 /// returned vector. \n 790 /// Bit [65]: \n 791 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 792 /// returned vector. 
\n 793 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 794 /// returned vector. 795 /// \returns A 128-bit vector of [2 x double] containing the copied values. 796 static __inline __m128d __DEFAULT_FN_ATTRS128 797 _mm_permutevar_pd(__m128d __a, __m128i __c) 798 { 799 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 800 } 801 802 /// Copies the values in a 256-bit vector of [4 x double] as specified 803 /// by the 256-bit integer vector operand. 804 /// 805 /// \headerfile <x86intrin.h> 806 /// 807 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 808 /// 809 /// \param __a 810 /// A 256-bit vector of [4 x double]. 811 /// \param __c 812 /// A 256-bit integer vector operand specifying how the values are to be 813 /// copied. \n 814 /// Bit [1]: \n 815 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 816 /// vector. \n 817 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 818 /// returned vector. \n 819 /// Bit [65]: \n 820 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 821 /// returned vector. \n 822 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 823 /// returned vector. \n 824 /// Bit [129]: \n 825 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the 826 /// returned vector. \n 827 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the 828 /// returned vector. \n 829 /// Bit [193]: \n 830 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the 831 /// returned vector. \n 832 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the 833 /// returned vector. 834 /// \returns A 256-bit vector of [4 x double] containing the copied values. 
835 static __inline __m256d __DEFAULT_FN_ATTRS 836 _mm256_permutevar_pd(__m256d __a, __m256i __c) 837 { 838 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 839 } 840 841 /// Copies the values stored in a 128-bit vector of [4 x float] as 842 /// specified by the 128-bit integer vector operand. 843 /// 844 /// \headerfile <x86intrin.h> 845 /// 846 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 847 /// 848 /// \param __a 849 /// A 128-bit vector of [4 x float]. 850 /// \param __c 851 /// A 128-bit integer vector operand specifying how the values are to be 852 /// copied. \n 853 /// Bits [1:0]: \n 854 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 855 /// returned vector. \n 856 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 857 /// returned vector. \n 858 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 859 /// returned vector. \n 860 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 861 /// returned vector. \n 862 /// Bits [33:32]: \n 863 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 864 /// returned vector. \n 865 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 866 /// returned vector. \n 867 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 868 /// returned vector. \n 869 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 870 /// returned vector. \n 871 /// Bits [65:64]: \n 872 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the 873 /// returned vector. \n 874 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the 875 /// returned vector. \n 876 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the 877 /// returned vector. \n 878 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the 879 /// returned vector. 
\n 880 /// Bits [97:96]: \n 881 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the 882 /// returned vector. \n 883 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the 884 /// returned vector. \n 885 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the 886 /// returned vector. \n 887 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the 888 /// returned vector. 889 /// \returns A 128-bit vector of [4 x float] containing the copied values. 890 static __inline __m128 __DEFAULT_FN_ATTRS128 891 _mm_permutevar_ps(__m128 __a, __m128i __c) 892 { 893 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 894 } 895 896 /// Copies the values stored in a 256-bit vector of [8 x float] as 897 /// specified by the 256-bit integer vector operand. 898 /// 899 /// \headerfile <x86intrin.h> 900 /// 901 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 902 /// 903 /// \param __a 904 /// A 256-bit vector of [8 x float]. 905 /// \param __c 906 /// A 256-bit integer vector operand specifying how the values are to be 907 /// copied. \n 908 /// Bits [1:0]: \n 909 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the 910 /// returned vector. \n 911 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the 912 /// returned vector. \n 913 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the 914 /// returned vector. \n 915 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the 916 /// returned vector. \n 917 /// Bits [33:32]: \n 918 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the 919 /// returned vector. \n 920 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the 921 /// returned vector. \n 922 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the 923 /// returned vector. \n 924 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the 925 /// returned vector. 
///          \n
///    Bits [65:64]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [97:96]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [129:128]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [161:160]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [193:192]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector.
\n 967 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the 968 /// returned vector. \n 969 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the 970 /// returned vector. \n 971 /// Bits [225:224]: \n 972 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the 973 /// returned vector. \n 974 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the 975 /// returned vector. \n 976 /// 10: Bits [223:192] of the source are copied to bits [255:224] of the 977 /// returned vector. \n 978 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the 979 /// returned vector. 980 /// \returns A 256-bit vector of [8 x float] containing the copied values. 981 static __inline __m256 __DEFAULT_FN_ATTRS 982 _mm256_permutevar_ps(__m256 __a, __m256i __c) 983 { 984 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 985 } 986 987 /// Copies the values in a 128-bit vector of [2 x double] as specified 988 /// by the immediate integer operand. 989 /// 990 /// \headerfile <x86intrin.h> 991 /// 992 /// \code 993 /// __m128d _mm_permute_pd(__m128d A, const int C); 994 /// \endcode 995 /// 996 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 997 /// 998 /// \param A 999 /// A 128-bit vector of [2 x double]. 1000 /// \param C 1001 /// An immediate integer operand specifying how the values are to be 1002 /// copied. \n 1003 /// Bit [0]: \n 1004 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 1005 /// vector. \n 1006 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the 1007 /// returned vector. \n 1008 /// Bit [1]: \n 1009 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the 1010 /// returned vector. \n 1011 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the 1012 /// returned vector. 1013 /// \returns A 128-bit vector of [2 x double] containing the copied values. 
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))

/// Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector.
///          \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector.
///          \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector.
///          \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination.
///          \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination.
///          \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))

/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))

/// Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))

/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \param __b
///    A 256-bit vector of [4 x double].
/// \param __c
///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
///    how the values are to be copied. The position of the mask bit corresponds
///    to the most significant bit of a copied value. When a mask bit is 0, the
///    corresponding 64-bit element in operand \a __a is copied to the same
///    position in the destination. When a mask bit is 1, the corresponding
///    64-bit element in operand \a __b is copied to the same position in the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
1405 static __inline __m256d __DEFAULT_FN_ATTRS 1406 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1407 { 1408 return (__m256d)__builtin_ia32_blendvpd256( 1409 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1410 } 1411 1412 /// Merges 32-bit single-precision data values stored in either of the 1413 /// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1414 /// operand. 1415 /// 1416 /// \headerfile <x86intrin.h> 1417 /// 1418 /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1419 /// 1420 /// \param __a 1421 /// A 256-bit vector of [8 x float]. 1422 /// \param __b 1423 /// A 256-bit vector of [8 x float]. 1424 /// \param __c 1425 /// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1426 /// and 31 specifying how the values are to be copied. The position of the 1427 /// mask bit corresponds to the most significant bit of a copied value. When 1428 /// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1429 /// copied to the same position in the destination. When a mask bit is 1, the 1430 /// corresponding 32-bit element in operand \a __b is copied to the same 1431 /// position in the destination. 1432 /// \returns A 256-bit vector of [8 x float] containing the copied values. 1433 static __inline __m256 __DEFAULT_FN_ATTRS 1434 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1435 { 1436 return (__m256)__builtin_ia32_blendvps256( 1437 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1438 } 1439 1440 /* Vector Dot Product */ 1441 /// Computes two dot products in parallel, using the lower and upper 1442 /// halves of two [8 x float] vectors as input to the two computations, and 1443 /// returning the two dot products in the lower and upper halves of the 1444 /// [8 x float] result. 1445 /// 1446 /// The immediate integer operand controls which input elements will 1447 /// contribute to the dot product, and where the final results are returned. 
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))

/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination.
///    \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (int)(mask)))

/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))

/* Compare */
/* Comparison predicate immediates 0x08-0x1f accepted by the _mm*_cmp_*
 * intrinsics documented below.
 * NOTE(review): predicates 0x00-0x07 are not defined in this part of the
 * file; presumably they are provided by another header -- confirm. */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/* Below intrinsic defined in emmintrin.h can be used for AVX */
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)

/* Below intrinsic defined in xmmintrin.h can be used for AVX */
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)

/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))

/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

/* Below intrinsic defined in emmintrin.h can be used for AVX */
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)

/* Below intrinsic defined in xmmintrin.h can be used for AVX */
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)

/// Takes an [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))

/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))

/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif

/// Takes an [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))


/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))

/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///    instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif

/* Conversion */
/// Converts a vector of [4 x i32] into a vector of [4 x double].
2168 /// 2169 /// \headerfile <x86intrin.h> 2170 /// 2171 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2172 /// 2173 /// \param __a 2174 /// A 128-bit integer vector of [4 x i32]. 2175 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2176 static __inline __m256d __DEFAULT_FN_ATTRS 2177 _mm256_cvtepi32_pd(__m128i __a) 2178 { 2179 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2180 } 2181 2182 /// Converts a vector of [8 x i32] into a vector of [8 x float]. 2183 /// 2184 /// \headerfile <x86intrin.h> 2185 /// 2186 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2187 /// 2188 /// \param __a 2189 /// A 256-bit integer vector. 2190 /// \returns A 256-bit vector of [8 x float] containing the converted values. 2191 static __inline __m256 __DEFAULT_FN_ATTRS 2192 _mm256_cvtepi32_ps(__m256i __a) 2193 { 2194 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); 2195 } 2196 2197 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2198 /// [4 x float]. 2199 /// 2200 /// \headerfile <x86intrin.h> 2201 /// 2202 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2203 /// 2204 /// \param __a 2205 /// A 256-bit vector of [4 x double]. 2206 /// \returns A 128-bit vector of [4 x float] containing the converted values. 2207 static __inline __m128 __DEFAULT_FN_ATTRS 2208 _mm256_cvtpd_ps(__m256d __a) 2209 { 2210 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2211 } 2212 2213 /// Converts a vector of [8 x float] into a vector of [8 x i32]. 2214 /// 2215 /// If a converted value does not fit in a 32-bit integer, raises a 2216 /// floating-point invalid exception. If the exception is masked, returns 2217 /// the most negative integer. 2218 /// 2219 /// \headerfile <x86intrin.h> 2220 /// 2221 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2222 /// 2223 /// \param __a 2224 /// A 256-bit vector of [8 x float]. 
2225 /// \returns A 256-bit integer vector containing the converted values. 2226 static __inline __m256i __DEFAULT_FN_ATTRS 2227 _mm256_cvtps_epi32(__m256 __a) 2228 { 2229 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2230 } 2231 2232 /// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2233 /// x double]. 2234 /// 2235 /// \headerfile <x86intrin.h> 2236 /// 2237 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2238 /// 2239 /// \param __a 2240 /// A 128-bit vector of [4 x float]. 2241 /// \returns A 256-bit vector of [4 x double] containing the converted values. 2242 static __inline __m256d __DEFAULT_FN_ATTRS 2243 _mm256_cvtps_pd(__m128 __a) 2244 { 2245 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2246 } 2247 2248 /// Converts a 256-bit vector of [4 x double] into four signed truncated 2249 /// (rounded toward zero) 32-bit integers returned in a 128-bit vector of 2250 /// [4 x i32]. 2251 /// 2252 /// If a converted value does not fit in a 32-bit integer, raises a 2253 /// floating-point invalid exception. If the exception is masked, returns 2254 /// the most negative integer. 2255 /// 2256 /// \headerfile <x86intrin.h> 2257 /// 2258 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2259 /// 2260 /// \param __a 2261 /// A 256-bit vector of [4 x double]. 2262 /// \returns A 128-bit integer vector containing the converted values. 2263 static __inline __m128i __DEFAULT_FN_ATTRS 2264 _mm256_cvttpd_epi32(__m256d __a) 2265 { 2266 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2267 } 2268 2269 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2270 /// [4 x i32]. 2271 /// 2272 /// If a converted value does not fit in a 32-bit integer, raises a 2273 /// floating-point invalid exception. If the exception is masked, returns 2274 /// the most negative integer. 
2275 /// 2276 /// \headerfile <x86intrin.h> 2277 /// 2278 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 2279 /// 2280 /// \param __a 2281 /// A 256-bit vector of [4 x double]. 2282 /// \returns A 128-bit integer vector containing the converted values. 2283 static __inline __m128i __DEFAULT_FN_ATTRS 2284 _mm256_cvtpd_epi32(__m256d __a) 2285 { 2286 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2287 } 2288 2289 /// Converts a vector of [8 x float] into eight signed truncated (rounded 2290 /// toward zero) 32-bit integers returned in a vector of [8 x i32]. 2291 /// 2292 /// If a converted value does not fit in a 32-bit integer, raises a 2293 /// floating-point invalid exception. If the exception is masked, returns 2294 /// the most negative integer. 2295 /// 2296 /// \headerfile <x86intrin.h> 2297 /// 2298 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2299 /// 2300 /// \param __a 2301 /// A 256-bit vector of [8 x float]. 2302 /// \returns A 256-bit integer vector containing the converted values. 2303 static __inline __m256i __DEFAULT_FN_ATTRS 2304 _mm256_cvttps_epi32(__m256 __a) 2305 { 2306 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2307 } 2308 2309 /// Returns the first element of the input vector of [4 x double]. 2310 /// 2311 /// \headerfile <x86intrin.h> 2312 /// 2313 /// This intrinsic is a utility function and does not correspond to a specific 2314 /// instruction. 2315 /// 2316 /// \param __a 2317 /// A 256-bit vector of [4 x double]. 2318 /// \returns A 64 bit double containing the first element of the input vector. 2319 static __inline double __DEFAULT_FN_ATTRS 2320 _mm256_cvtsd_f64(__m256d __a) 2321 { 2322 return __a[0]; 2323 } 2324 2325 /// Returns the first element of the input vector of [8 x i32]. 2326 /// 2327 /// \headerfile <x86intrin.h> 2328 /// 2329 /// This intrinsic is a utility function and does not correspond to a specific 2330 /// instruction. 
2331 /// 2332 /// \param __a 2333 /// A 256-bit vector of [8 x i32]. 2334 /// \returns A 32 bit integer containing the first element of the input vector. 2335 static __inline int __DEFAULT_FN_ATTRS 2336 _mm256_cvtsi256_si32(__m256i __a) 2337 { 2338 __v8si __b = (__v8si)__a; 2339 return __b[0]; 2340 } 2341 2342 /// Returns the first element of the input vector of [8 x float]. 2343 /// 2344 /// \headerfile <x86intrin.h> 2345 /// 2346 /// This intrinsic is a utility function and does not correspond to a specific 2347 /// instruction. 2348 /// 2349 /// \param __a 2350 /// A 256-bit vector of [8 x float]. 2351 /// \returns A 32 bit float containing the first element of the input vector. 2352 static __inline float __DEFAULT_FN_ATTRS 2353 _mm256_cvtss_f32(__m256 __a) 2354 { 2355 return __a[0]; 2356 } 2357 2358 /* Vector replicate */ 2359 /// Moves and duplicates odd-indexed values from a 256-bit vector of 2360 /// [8 x float] to float values in a 256-bit vector of [8 x float]. 2361 /// 2362 /// \headerfile <x86intrin.h> 2363 /// 2364 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2365 /// 2366 /// \param __a 2367 /// A 256-bit vector of [8 x float]. \n 2368 /// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2369 /// the return value. \n 2370 /// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2371 /// the return value. \n 2372 /// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2373 /// return value. \n 2374 /// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2375 /// return value. 2376 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2377 /// values. 
2378 static __inline __m256 __DEFAULT_FN_ATTRS 2379 _mm256_movehdup_ps(__m256 __a) 2380 { 2381 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2382 } 2383 2384 /// Moves and duplicates even-indexed values from a 256-bit vector of 2385 /// [8 x float] to float values in a 256-bit vector of [8 x float]. 2386 /// 2387 /// \headerfile <x86intrin.h> 2388 /// 2389 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2390 /// 2391 /// \param __a 2392 /// A 256-bit vector of [8 x float]. \n 2393 /// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2394 /// the return value. \n 2395 /// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2396 /// the return value. \n 2397 /// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2398 /// return value. \n 2399 /// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2400 /// return value. 2401 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2402 /// values. 2403 static __inline __m256 __DEFAULT_FN_ATTRS 2404 _mm256_moveldup_ps(__m256 __a) 2405 { 2406 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2407 } 2408 2409 /// Moves and duplicates double-precision floating point values from a 2410 /// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2411 /// vector of [4 x double]. 2412 /// 2413 /// \headerfile <x86intrin.h> 2414 /// 2415 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2416 /// 2417 /// \param __a 2418 /// A 256-bit vector of [4 x double]. \n 2419 /// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2420 /// return value. \n 2421 /// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2422 /// the return value. 2423 /// \returns A 256-bit vector of [4 x double] containing the moved and 2424 /// duplicated values. 
2425 static __inline __m256d __DEFAULT_FN_ATTRS 2426 _mm256_movedup_pd(__m256d __a) 2427 { 2428 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2429 } 2430 2431 /* Unpack and Interleave */ 2432 /// Unpacks the odd-indexed vector elements from two 256-bit vectors of 2433 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2434 /// 2435 /// \headerfile <x86intrin.h> 2436 /// 2437 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2438 /// 2439 /// \param __a 2440 /// A 256-bit floating-point vector of [4 x double]. \n 2441 /// Bits [127:64] are written to bits [63:0] of the return value. \n 2442 /// Bits [255:192] are written to bits [191:128] of the return value. \n 2443 /// \param __b 2444 /// A 256-bit floating-point vector of [4 x double]. \n 2445 /// Bits [127:64] are written to bits [127:64] of the return value. \n 2446 /// Bits [255:192] are written to bits [255:192] of the return value. \n 2447 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2448 static __inline __m256d __DEFAULT_FN_ATTRS 2449 _mm256_unpackhi_pd(__m256d __a, __m256d __b) 2450 { 2451 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2452 } 2453 2454 /// Unpacks the even-indexed vector elements from two 256-bit vectors of 2455 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2456 /// 2457 /// \headerfile <x86intrin.h> 2458 /// 2459 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2460 /// 2461 /// \param __a 2462 /// A 256-bit floating-point vector of [4 x double]. \n 2463 /// Bits [63:0] are written to bits [63:0] of the return value. \n 2464 /// Bits [191:128] are written to bits [191:128] of the return value. 2465 /// \param __b 2466 /// A 256-bit floating-point vector of [4 x double]. \n 2467 /// Bits [63:0] are written to bits [127:64] of the return value. 
\n 2468 /// Bits [191:128] are written to bits [255:192] of the return value. \n 2469 /// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2470 static __inline __m256d __DEFAULT_FN_ATTRS 2471 _mm256_unpacklo_pd(__m256d __a, __m256d __b) 2472 { 2473 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2474 } 2475 2476 /// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2477 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2478 /// vector of [8 x float]. 2479 /// 2480 /// \headerfile <x86intrin.h> 2481 /// 2482 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2483 /// 2484 /// \param __a 2485 /// A 256-bit vector of [8 x float]. \n 2486 /// Bits [95:64] are written to bits [31:0] of the return value. \n 2487 /// Bits [127:96] are written to bits [95:64] of the return value. \n 2488 /// Bits [223:192] are written to bits [159:128] of the return value. \n 2489 /// Bits [255:224] are written to bits [223:192] of the return value. 2490 /// \param __b 2491 /// A 256-bit vector of [8 x float]. \n 2492 /// Bits [95:64] are written to bits [63:32] of the return value. \n 2493 /// Bits [127:96] are written to bits [127:96] of the return value. \n 2494 /// Bits [223:192] are written to bits [191:160] of the return value. \n 2495 /// Bits [255:224] are written to bits [255:224] of the return value. 2496 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2497 static __inline __m256 __DEFAULT_FN_ATTRS 2498 _mm256_unpackhi_ps(__m256 __a, __m256 __b) 2499 { 2500 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2501 } 2502 2503 /// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2504 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2505 /// vector of [8 x float]. 
2506 /// 2507 /// \headerfile <x86intrin.h> 2508 /// 2509 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2510 /// 2511 /// \param __a 2512 /// A 256-bit vector of [8 x float]. \n 2513 /// Bits [31:0] are written to bits [31:0] of the return value. \n 2514 /// Bits [63:32] are written to bits [95:64] of the return value. \n 2515 /// Bits [159:128] are written to bits [159:128] of the return value. \n 2516 /// Bits [191:160] are written to bits [223:192] of the return value. 2517 /// \param __b 2518 /// A 256-bit vector of [8 x float]. \n 2519 /// Bits [31:0] are written to bits [63:32] of the return value. \n 2520 /// Bits [63:32] are written to bits [127:96] of the return value. \n 2521 /// Bits [159:128] are written to bits [191:160] of the return value. \n 2522 /// Bits [191:160] are written to bits [255:224] of the return value. 2523 /// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2524 static __inline __m256 __DEFAULT_FN_ATTRS 2525 _mm256_unpacklo_ps(__m256 __a, __m256 __b) 2526 { 2527 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2528 } 2529 2530 /* Bit Test */ 2531 /// Given two 128-bit floating-point vectors of [2 x double], perform an 2532 /// element-by-element comparison of the double-precision element in the 2533 /// first source vector and the corresponding element in the second source 2534 /// vector. 2535 /// 2536 /// The EFLAGS register is updated as follows: \n 2537 /// If there is at least one pair of double-precision elements where the 2538 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2539 /// ZF flag is set to 1. \n 2540 /// If there is at least one pair of double-precision elements where the 2541 /// sign-bit of the first element is 0 and the sign-bit of the second element 2542 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2543 /// This intrinsic returns the value of the ZF flag. 
2544 /// 2545 /// \headerfile <x86intrin.h> 2546 /// 2547 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2548 /// 2549 /// \param __a 2550 /// A 128-bit vector of [2 x double]. 2551 /// \param __b 2552 /// A 128-bit vector of [2 x double]. 2553 /// \returns the ZF flag in the EFLAGS register. 2554 static __inline int __DEFAULT_FN_ATTRS128 2555 _mm_testz_pd(__m128d __a, __m128d __b) 2556 { 2557 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2558 } 2559 2560 /// Given two 128-bit floating-point vectors of [2 x double], perform an 2561 /// element-by-element comparison of the double-precision element in the 2562 /// first source vector and the corresponding element in the second source 2563 /// vector. 2564 /// 2565 /// The EFLAGS register is updated as follows: \n 2566 /// If there is at least one pair of double-precision elements where the 2567 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2568 /// ZF flag is set to 1. \n 2569 /// If there is at least one pair of double-precision elements where the 2570 /// sign-bit of the first element is 0 and the sign-bit of the second element 2571 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2572 /// This intrinsic returns the value of the CF flag. 2573 /// 2574 /// \headerfile <x86intrin.h> 2575 /// 2576 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2577 /// 2578 /// \param __a 2579 /// A 128-bit vector of [2 x double]. 2580 /// \param __b 2581 /// A 128-bit vector of [2 x double]. 2582 /// \returns the CF flag in the EFLAGS register. 
2583 static __inline int __DEFAULT_FN_ATTRS128 2584 _mm_testc_pd(__m128d __a, __m128d __b) 2585 { 2586 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2587 } 2588 2589 /// Given two 128-bit floating-point vectors of [2 x double], perform an 2590 /// element-by-element comparison of the double-precision element in the 2591 /// first source vector and the corresponding element in the second source 2592 /// vector. 2593 /// 2594 /// The EFLAGS register is updated as follows: \n 2595 /// If there is at least one pair of double-precision elements where the 2596 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2597 /// ZF flag is set to 1. \n 2598 /// If there is at least one pair of double-precision elements where the 2599 /// sign-bit of the first element is 0 and the sign-bit of the second element 2600 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2601 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2602 /// otherwise it returns 0. 2603 /// 2604 /// \headerfile <x86intrin.h> 2605 /// 2606 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2607 /// 2608 /// \param __a 2609 /// A 128-bit vector of [2 x double]. 2610 /// \param __b 2611 /// A 128-bit vector of [2 x double]. 2612 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2613 static __inline int __DEFAULT_FN_ATTRS128 2614 _mm_testnzc_pd(__m128d __a, __m128d __b) 2615 { 2616 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2617 } 2618 2619 /// Given two 128-bit floating-point vectors of [4 x float], perform an 2620 /// element-by-element comparison of the single-precision element in the 2621 /// first source vector and the corresponding element in the second source 2622 /// vector. 
2623 /// 2624 /// The EFLAGS register is updated as follows: \n 2625 /// If there is at least one pair of single-precision elements where the 2626 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2627 /// ZF flag is set to 1. \n 2628 /// If there is at least one pair of single-precision elements where the 2629 /// sign-bit of the first element is 0 and the sign-bit of the second element 2630 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2631 /// This intrinsic returns the value of the ZF flag. 2632 /// 2633 /// \headerfile <x86intrin.h> 2634 /// 2635 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2636 /// 2637 /// \param __a 2638 /// A 128-bit vector of [4 x float]. 2639 /// \param __b 2640 /// A 128-bit vector of [4 x float]. 2641 /// \returns the ZF flag. 2642 static __inline int __DEFAULT_FN_ATTRS128 2643 _mm_testz_ps(__m128 __a, __m128 __b) 2644 { 2645 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2646 } 2647 2648 /// Given two 128-bit floating-point vectors of [4 x float], perform an 2649 /// element-by-element comparison of the single-precision element in the 2650 /// first source vector and the corresponding element in the second source 2651 /// vector. 2652 /// 2653 /// The EFLAGS register is updated as follows: \n 2654 /// If there is at least one pair of single-precision elements where the 2655 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2656 /// ZF flag is set to 1. \n 2657 /// If there is at least one pair of single-precision elements where the 2658 /// sign-bit of the first element is 0 and the sign-bit of the second element 2659 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2660 /// This intrinsic returns the value of the CF flag. 2661 /// 2662 /// \headerfile <x86intrin.h> 2663 /// 2664 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2665 /// 2666 /// \param __a 2667 /// A 128-bit vector of [4 x float]. 2668 /// \param __b 2669 /// A 128-bit vector of [4 x float]. 2670 /// \returns the CF flag. 2671 static __inline int __DEFAULT_FN_ATTRS128 2672 _mm_testc_ps(__m128 __a, __m128 __b) 2673 { 2674 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2675 } 2676 2677 /// Given two 128-bit floating-point vectors of [4 x float], perform an 2678 /// element-by-element comparison of the single-precision element in the 2679 /// first source vector and the corresponding element in the second source 2680 /// vector. 2681 /// 2682 /// The EFLAGS register is updated as follows: \n 2683 /// If there is at least one pair of single-precision elements where the 2684 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2685 /// ZF flag is set to 1. \n 2686 /// If there is at least one pair of single-precision elements where the 2687 /// sign-bit of the first element is 0 and the sign-bit of the second element 2688 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2689 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2690 /// otherwise it returns 0. 2691 /// 2692 /// \headerfile <x86intrin.h> 2693 /// 2694 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2695 /// 2696 /// \param __a 2697 /// A 128-bit vector of [4 x float]. 2698 /// \param __b 2699 /// A 128-bit vector of [4 x float]. 2700 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2701 static __inline int __DEFAULT_FN_ATTRS128 2702 _mm_testnzc_ps(__m128 __a, __m128 __b) 2703 { 2704 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2705 } 2706 2707 /// Given two 256-bit floating-point vectors of [4 x double], perform an 2708 /// element-by-element comparison of the double-precision elements in the 2709 /// first source vector and the corresponding elements in the second source 2710 /// vector. 
2711 /// 2712 /// The EFLAGS register is updated as follows: \n 2713 /// If there is at least one pair of double-precision elements where the 2714 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2715 /// ZF flag is set to 1. \n 2716 /// If there is at least one pair of double-precision elements where the 2717 /// sign-bit of the first element is 0 and the sign-bit of the second element 2718 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2719 /// This intrinsic returns the value of the ZF flag. 2720 /// 2721 /// \headerfile <x86intrin.h> 2722 /// 2723 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2724 /// 2725 /// \param __a 2726 /// A 256-bit vector of [4 x double]. 2727 /// \param __b 2728 /// A 256-bit vector of [4 x double]. 2729 /// \returns the ZF flag. 2730 static __inline int __DEFAULT_FN_ATTRS 2731 _mm256_testz_pd(__m256d __a, __m256d __b) 2732 { 2733 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2734 } 2735 2736 /// Given two 256-bit floating-point vectors of [4 x double], perform an 2737 /// element-by-element comparison of the double-precision elements in the 2738 /// first source vector and the corresponding elements in the second source 2739 /// vector. 2740 /// 2741 /// The EFLAGS register is updated as follows: \n 2742 /// If there is at least one pair of double-precision elements where the 2743 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2744 /// ZF flag is set to 1. \n 2745 /// If there is at least one pair of double-precision elements where the 2746 /// sign-bit of the first element is 0 and the sign-bit of the second element 2747 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2748 /// This intrinsic returns the value of the CF flag. 2749 /// 2750 /// \headerfile <x86intrin.h> 2751 /// 2752 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2753 /// 2754 /// \param __a 2755 /// A 256-bit vector of [4 x double]. 2756 /// \param __b 2757 /// A 256-bit vector of [4 x double]. 2758 /// \returns the CF flag. 2759 static __inline int __DEFAULT_FN_ATTRS 2760 _mm256_testc_pd(__m256d __a, __m256d __b) 2761 { 2762 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2763 } 2764 2765 /// Given two 256-bit floating-point vectors of [4 x double], perform an 2766 /// element-by-element comparison of the double-precision elements in the 2767 /// first source vector and the corresponding elements in the second source 2768 /// vector. 2769 /// 2770 /// The EFLAGS register is updated as follows: \n 2771 /// If there is at least one pair of double-precision elements where the 2772 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2773 /// ZF flag is set to 1. \n 2774 /// If there is at least one pair of double-precision elements where the 2775 /// sign-bit of the first element is 0 and the sign-bit of the second element 2776 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2777 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2778 /// otherwise it returns 0. 2779 /// 2780 /// \headerfile <x86intrin.h> 2781 /// 2782 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2783 /// 2784 /// \param __a 2785 /// A 256-bit vector of [4 x double]. 2786 /// \param __b 2787 /// A 256-bit vector of [4 x double]. 2788 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2789 static __inline int __DEFAULT_FN_ATTRS 2790 _mm256_testnzc_pd(__m256d __a, __m256d __b) 2791 { 2792 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2793 } 2794 2795 /// Given two 256-bit floating-point vectors of [8 x float], perform an 2796 /// element-by-element comparison of the single-precision element in the 2797 /// first source vector and the corresponding element in the second source 2798 /// vector. 
2799 /// 2800 /// The EFLAGS register is updated as follows: \n 2801 /// If there is at least one pair of single-precision elements where the 2802 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2803 /// ZF flag is set to 1. \n 2804 /// If there is at least one pair of single-precision elements where the 2805 /// sign-bit of the first element is 0 and the sign-bit of the second element 2806 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2807 /// This intrinsic returns the value of the ZF flag. 2808 /// 2809 /// \headerfile <x86intrin.h> 2810 /// 2811 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2812 /// 2813 /// \param __a 2814 /// A 256-bit vector of [8 x float]. 2815 /// \param __b 2816 /// A 256-bit vector of [8 x float]. 2817 /// \returns the ZF flag. 2818 static __inline int __DEFAULT_FN_ATTRS 2819 _mm256_testz_ps(__m256 __a, __m256 __b) 2820 { 2821 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2822 } 2823 2824 /// Given two 256-bit floating-point vectors of [8 x float], perform an 2825 /// element-by-element comparison of the single-precision element in the 2826 /// first source vector and the corresponding element in the second source 2827 /// vector. 2828 /// 2829 /// The EFLAGS register is updated as follows: \n 2830 /// If there is at least one pair of single-precision elements where the 2831 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2832 /// ZF flag is set to 1. \n 2833 /// If there is at least one pair of single-precision elements where the 2834 /// sign-bit of the first element is 0 and the sign-bit of the second element 2835 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2836 /// This intrinsic returns the value of the CF flag. 2837 /// 2838 /// \headerfile <x86intrin.h> 2839 /// 2840 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 
2841 /// 2842 /// \param __a 2843 /// A 256-bit vector of [8 x float]. 2844 /// \param __b 2845 /// A 256-bit vector of [8 x float]. 2846 /// \returns the CF flag. 2847 static __inline int __DEFAULT_FN_ATTRS 2848 _mm256_testc_ps(__m256 __a, __m256 __b) 2849 { 2850 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2851 } 2852 2853 /// Given two 256-bit floating-point vectors of [8 x float], perform an 2854 /// element-by-element comparison of the single-precision elements in the 2855 /// first source vector and the corresponding elements in the second source 2856 /// vector. 2857 /// 2858 /// The EFLAGS register is updated as follows: \n 2859 /// If there is at least one pair of single-precision elements where the 2860 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2861 /// ZF flag is set to 1. \n 2862 /// If there is at least one pair of single-precision elements where the 2863 /// sign-bit of the first element is 0 and the sign-bit of the second element 2864 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2865 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2866 /// otherwise it returns 0. 2867 /// 2868 /// \headerfile <x86intrin.h> 2869 /// 2870 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2871 /// 2872 /// \param __a 2873 /// A 256-bit vector of [8 x float]. 2874 /// \param __b 2875 /// A 256-bit vector of [8 x float]. 2876 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2877 static __inline int __DEFAULT_FN_ATTRS 2878 _mm256_testnzc_ps(__m256 __a, __m256 __b) 2879 { 2880 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2881 } 2882 2883 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2884 /// of the two source vectors. 2885 /// 2886 /// The EFLAGS register is updated as follows: \n 2887 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2888 /// is set to 0. 
Otherwise the ZF flag is set to 1. \n 2889 /// If there is at least one pair of bits where the bit from the first source 2890 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2891 /// is set to 0. Otherwise the CF flag is set to 1. \n 2892 /// This intrinsic returns the value of the ZF flag. 2893 /// 2894 /// \headerfile <x86intrin.h> 2895 /// 2896 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2897 /// 2898 /// \param __a 2899 /// A 256-bit integer vector. 2900 /// \param __b 2901 /// A 256-bit integer vector. 2902 /// \returns the ZF flag. 2903 static __inline int __DEFAULT_FN_ATTRS 2904 _mm256_testz_si256(__m256i __a, __m256i __b) 2905 { 2906 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2907 } 2908 2909 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2910 /// of the two source vectors. 2911 /// 2912 /// The EFLAGS register is updated as follows: \n 2913 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2914 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2915 /// If there is at least one pair of bits where the bit from the first source 2916 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2917 /// is set to 0. Otherwise the CF flag is set to 1. \n 2918 /// This intrinsic returns the value of the CF flag. 2919 /// 2920 /// \headerfile <x86intrin.h> 2921 /// 2922 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2923 /// 2924 /// \param __a 2925 /// A 256-bit integer vector. 2926 /// \param __b 2927 /// A 256-bit integer vector. 2928 /// \returns the CF flag. 2929 static __inline int __DEFAULT_FN_ATTRS 2930 _mm256_testc_si256(__m256i __a, __m256i __b) 2931 { 2932 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2933 } 2934 2935 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison 2936 /// of the two source vectors. 
2937 /// 2938 /// The EFLAGS register is updated as follows: \n 2939 /// If there is at least one pair of bits where both bits are 1, the ZF flag 2940 /// is set to 0. Otherwise the ZF flag is set to 1. \n 2941 /// If there is at least one pair of bits where the bit from the first source 2942 /// vector is 0 and the bit from the second source vector is 1, the CF flag 2943 /// is set to 0. Otherwise the CF flag is set to 1. \n 2944 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2945 /// otherwise it returns 0. 2946 /// 2947 /// \headerfile <x86intrin.h> 2948 /// 2949 /// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2950 /// 2951 /// \param __a 2952 /// A 256-bit integer vector. 2953 /// \param __b 2954 /// A 256-bit integer vector. 2955 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2956 static __inline int __DEFAULT_FN_ATTRS 2957 _mm256_testnzc_si256(__m256i __a, __m256i __b) 2958 { 2959 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2960 } 2961 2962 /* Vector extract sign mask */ 2963 /// Extracts the sign bits of double-precision floating point elements 2964 /// in a 256-bit vector of [4 x double] and writes them to the lower order 2965 /// bits of the return value. 2966 /// 2967 /// \headerfile <x86intrin.h> 2968 /// 2969 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 2970 /// 2971 /// \param __a 2972 /// A 256-bit vector of [4 x double] containing the double-precision 2973 /// floating point values with sign bits to be extracted. 2974 /// \returns The sign bits from the operand, written to bits [3:0]. 2975 static __inline int __DEFAULT_FN_ATTRS 2976 _mm256_movemask_pd(__m256d __a) 2977 { 2978 return __builtin_ia32_movmskpd256((__v4df)__a); 2979 } 2980 2981 /// Extracts the sign bits of single-precision floating point elements 2982 /// in a 256-bit vector of [8 x float] and writes them to the lower order 2983 /// bits of the return value. 
2984 /// 2985 /// \headerfile <x86intrin.h> 2986 /// 2987 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 2988 /// 2989 /// \param __a 2990 /// A 256-bit vector of [8 x float] containing the single-precision floating 2991 /// point values with sign bits to be extracted. 2992 /// \returns The sign bits from the operand, written to bits [7:0]. 2993 static __inline int __DEFAULT_FN_ATTRS 2994 _mm256_movemask_ps(__m256 __a) 2995 { 2996 return __builtin_ia32_movmskps256((__v8sf)__a); 2997 } 2998 2999 /* Vector __zero */ 3000 /// Zeroes the contents of all XMM or YMM registers. 3001 /// 3002 /// \headerfile <x86intrin.h> 3003 /// 3004 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 3005 static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 3006 _mm256_zeroall(void) 3007 { 3008 __builtin_ia32_vzeroall(); 3009 } 3010 3011 /// Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 3012 /// 3013 /// \headerfile <x86intrin.h> 3014 /// 3015 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 3016 static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 3017 _mm256_zeroupper(void) 3018 { 3019 __builtin_ia32_vzeroupper(); 3020 } 3021 3022 /* Vector load with broadcast */ 3023 /// Loads a scalar single-precision floating point value from the 3024 /// specified address pointed to by \a __a and broadcasts it to the elements 3025 /// of a [4 x float] vector. 3026 /// 3027 /// \headerfile <x86intrin.h> 3028 /// 3029 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3030 /// 3031 /// \param __a 3032 /// The single-precision floating point value to be broadcast. 3033 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 3034 /// equal to the broadcast value. 
3035 static __inline __m128 __DEFAULT_FN_ATTRS128 3036 _mm_broadcast_ss(float const *__a) 3037 { 3038 struct __mm_broadcast_ss_struct { 3039 float __f; 3040 } __attribute__((__packed__, __may_alias__)); 3041 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f; 3042 return __extension__ (__m128){ __f, __f, __f, __f }; 3043 } 3044 3045 /// Loads a scalar double-precision floating point value from the 3046 /// specified address pointed to by \a __a and broadcasts it to the elements 3047 /// of a [4 x double] vector. 3048 /// 3049 /// \headerfile <x86intrin.h> 3050 /// 3051 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 3052 /// 3053 /// \param __a 3054 /// The double-precision floating point value to be broadcast. 3055 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 3056 /// equal to the broadcast value. 3057 static __inline __m256d __DEFAULT_FN_ATTRS 3058 _mm256_broadcast_sd(double const *__a) 3059 { 3060 struct __mm256_broadcast_sd_struct { 3061 double __d; 3062 } __attribute__((__packed__, __may_alias__)); 3063 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d; 3064 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d }; 3065 } 3066 3067 /// Loads a scalar single-precision floating point value from the 3068 /// specified address pointed to by \a __a and broadcasts it to the elements 3069 /// of a [8 x float] vector. 3070 /// 3071 /// \headerfile <x86intrin.h> 3072 /// 3073 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3074 /// 3075 /// \param __a 3076 /// The single-precision floating point value to be broadcast. 3077 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3078 /// equal to the broadcast value. 
3079 static __inline __m256 __DEFAULT_FN_ATTRS 3080 _mm256_broadcast_ss(float const *__a) 3081 { 3082 struct __mm256_broadcast_ss_struct { 3083 float __f; 3084 } __attribute__((__packed__, __may_alias__)); 3085 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f; 3086 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3087 } 3088 3089 /// Loads the data from a 128-bit vector of [2 x double] from the 3090 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3091 /// elements in a 256-bit vector of [4 x double]. 3092 /// 3093 /// \headerfile <x86intrin.h> 3094 /// 3095 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3096 /// 3097 /// \param __a 3098 /// The 128-bit vector of [2 x double] to be broadcast. 3099 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3100 /// equal to the broadcast value. 3101 static __inline __m256d __DEFAULT_FN_ATTRS 3102 _mm256_broadcast_pd(__m128d const *__a) 3103 { 3104 __m128d __b = _mm_loadu_pd((const double *)__a); 3105 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b, 3106 0, 1, 0, 1); 3107 } 3108 3109 /// Loads the data from a 128-bit vector of [4 x float] from the 3110 /// specified address pointed to by \a __a and broadcasts it to 128-bit 3111 /// elements in a 256-bit vector of [8 x float]. 3112 /// 3113 /// \headerfile <x86intrin.h> 3114 /// 3115 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3116 /// 3117 /// \param __a 3118 /// The 128-bit vector of [4 x float] to be broadcast. 3119 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3120 /// equal to the broadcast value. 
3121 static __inline __m256 __DEFAULT_FN_ATTRS 3122 _mm256_broadcast_ps(__m128 const *__a) 3123 { 3124 __m128 __b = _mm_loadu_ps((const float *)__a); 3125 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b, 3126 0, 1, 2, 3, 0, 1, 2, 3); 3127 } 3128 3129 /* SIMD load ops */ 3130 /// Loads 4 double-precision floating point values from a 32-byte aligned 3131 /// memory location pointed to by \a __p into a vector of [4 x double]. 3132 /// 3133 /// \headerfile <x86intrin.h> 3134 /// 3135 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3136 /// 3137 /// \param __p 3138 /// A 32-byte aligned pointer to a memory location containing 3139 /// double-precision floating point values. 3140 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3141 static __inline __m256d __DEFAULT_FN_ATTRS 3142 _mm256_load_pd(double const *__p) 3143 { 3144 return *(const __m256d *)__p; 3145 } 3146 3147 /// Loads 8 single-precision floating point values from a 32-byte aligned 3148 /// memory location pointed to by \a __p into a vector of [8 x float]. 3149 /// 3150 /// \headerfile <x86intrin.h> 3151 /// 3152 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3153 /// 3154 /// \param __p 3155 /// A 32-byte aligned pointer to a memory location containing float values. 3156 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3157 static __inline __m256 __DEFAULT_FN_ATTRS 3158 _mm256_load_ps(float const *__p) 3159 { 3160 return *(const __m256 *)__p; 3161 } 3162 3163 /// Loads 4 double-precision floating point values from an unaligned 3164 /// memory location pointed to by \a __p into a vector of [4 x double]. 3165 /// 3166 /// \headerfile <x86intrin.h> 3167 /// 3168 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3169 /// 3170 /// \param __p 3171 /// A pointer to a memory location containing double-precision floating 3172 /// point values. 
3173 /// \returns A 256-bit vector of [4 x double] containing the moved values. 3174 static __inline __m256d __DEFAULT_FN_ATTRS 3175 _mm256_loadu_pd(double const *__p) 3176 { 3177 struct __loadu_pd { 3178 __m256d_u __v; 3179 } __attribute__((__packed__, __may_alias__)); 3180 return ((const struct __loadu_pd*)__p)->__v; 3181 } 3182 3183 /// Loads 8 single-precision floating point values from an unaligned 3184 /// memory location pointed to by \a __p into a vector of [8 x float]. 3185 /// 3186 /// \headerfile <x86intrin.h> 3187 /// 3188 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3189 /// 3190 /// \param __p 3191 /// A pointer to a memory location containing single-precision floating 3192 /// point values. 3193 /// \returns A 256-bit vector of [8 x float] containing the moved values. 3194 static __inline __m256 __DEFAULT_FN_ATTRS 3195 _mm256_loadu_ps(float const *__p) 3196 { 3197 struct __loadu_ps { 3198 __m256_u __v; 3199 } __attribute__((__packed__, __may_alias__)); 3200 return ((const struct __loadu_ps*)__p)->__v; 3201 } 3202 3203 /// Loads 256 bits of integer data from a 32-byte aligned memory 3204 /// location pointed to by \a __p into elements of a 256-bit integer vector. 3205 /// 3206 /// \headerfile <x86intrin.h> 3207 /// 3208 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3209 /// 3210 /// \param __p 3211 /// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3212 /// values. 3213 /// \returns A 256-bit integer vector containing the moved values. 3214 static __inline __m256i __DEFAULT_FN_ATTRS 3215 _mm256_load_si256(__m256i const *__p) 3216 { 3217 return *__p; 3218 } 3219 3220 /// Loads 256 bits of integer data from an unaligned memory location 3221 /// pointed to by \a __p into a 256-bit integer vector. 3222 /// 3223 /// \headerfile <x86intrin.h> 3224 /// 3225 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 
3226 /// 3227 /// \param __p 3228 /// A pointer to a 256-bit integer vector containing integer values. 3229 /// \returns A 256-bit integer vector containing the moved values. 3230 static __inline __m256i __DEFAULT_FN_ATTRS 3231 _mm256_loadu_si256(__m256i_u const *__p) 3232 { 3233 struct __loadu_si256 { 3234 __m256i_u __v; 3235 } __attribute__((__packed__, __may_alias__)); 3236 return ((const struct __loadu_si256*)__p)->__v; 3237 } 3238 3239 /// Loads 256 bits of integer data from an unaligned memory location 3240 /// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3241 /// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3242 /// line boundary. 3243 /// 3244 /// \headerfile <x86intrin.h> 3245 /// 3246 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 3247 /// 3248 /// \param __p 3249 /// A pointer to a 256-bit integer vector containing integer values. 3250 /// \returns A 256-bit integer vector containing the moved values. 3251 static __inline __m256i __DEFAULT_FN_ATTRS 3252 _mm256_lddqu_si256(__m256i_u const *__p) 3253 { 3254 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3255 } 3256 3257 /* SIMD store ops */ 3258 /// Stores double-precision floating point values from a 256-bit vector 3259 /// of [4 x double] to a 32-byte aligned memory location pointed to by 3260 /// \a __p. 3261 /// 3262 /// \headerfile <x86intrin.h> 3263 /// 3264 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3265 /// 3266 /// \param __p 3267 /// A 32-byte aligned pointer to a memory location that will receive the 3268 /// double-precision floaing point values. 3269 /// \param __a 3270 /// A 256-bit vector of [4 x double] containing the values to be moved. 
3271 static __inline void __DEFAULT_FN_ATTRS 3272 _mm256_store_pd(double *__p, __m256d __a) 3273 { 3274 *(__m256d *)__p = __a; 3275 } 3276 3277 /// Stores single-precision floating point values from a 256-bit vector 3278 /// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 3279 /// 3280 /// \headerfile <x86intrin.h> 3281 /// 3282 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3283 /// 3284 /// \param __p 3285 /// A 32-byte aligned pointer to a memory location that will receive the 3286 /// float values. 3287 /// \param __a 3288 /// A 256-bit vector of [8 x float] containing the values to be moved. 3289 static __inline void __DEFAULT_FN_ATTRS 3290 _mm256_store_ps(float *__p, __m256 __a) 3291 { 3292 *(__m256 *)__p = __a; 3293 } 3294 3295 /// Stores double-precision floating point values from a 256-bit vector 3296 /// of [4 x double] to an unaligned memory location pointed to by \a __p. 3297 /// 3298 /// \headerfile <x86intrin.h> 3299 /// 3300 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3301 /// 3302 /// \param __p 3303 /// A pointer to a memory location that will receive the double-precision 3304 /// floating point values. 3305 /// \param __a 3306 /// A 256-bit vector of [4 x double] containing the values to be moved. 3307 static __inline void __DEFAULT_FN_ATTRS 3308 _mm256_storeu_pd(double *__p, __m256d __a) 3309 { 3310 struct __storeu_pd { 3311 __m256d_u __v; 3312 } __attribute__((__packed__, __may_alias__)); 3313 ((struct __storeu_pd*)__p)->__v = __a; 3314 } 3315 3316 /// Stores single-precision floating point values from a 256-bit vector 3317 /// of [8 x float] to an unaligned memory location pointed to by \a __p. 3318 /// 3319 /// \headerfile <x86intrin.h> 3320 /// 3321 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3322 /// 3323 /// \param __p 3324 /// A pointer to a memory location that will receive the float values. 
3325 /// \param __a 3326 /// A 256-bit vector of [8 x float] containing the values to be moved. 3327 static __inline void __DEFAULT_FN_ATTRS 3328 _mm256_storeu_ps(float *__p, __m256 __a) 3329 { 3330 struct __storeu_ps { 3331 __m256_u __v; 3332 } __attribute__((__packed__, __may_alias__)); 3333 ((struct __storeu_ps*)__p)->__v = __a; 3334 } 3335 3336 /// Stores integer values from a 256-bit integer vector to a 32-byte 3337 /// aligned memory location pointed to by \a __p. 3338 /// 3339 /// \headerfile <x86intrin.h> 3340 /// 3341 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3342 /// 3343 /// \param __p 3344 /// A 32-byte aligned pointer to a memory location that will receive the 3345 /// integer values. 3346 /// \param __a 3347 /// A 256-bit integer vector containing the values to be moved. 3348 static __inline void __DEFAULT_FN_ATTRS 3349 _mm256_store_si256(__m256i *__p, __m256i __a) 3350 { 3351 *__p = __a; 3352 } 3353 3354 /// Stores integer values from a 256-bit integer vector to an unaligned 3355 /// memory location pointed to by \a __p. 3356 /// 3357 /// \headerfile <x86intrin.h> 3358 /// 3359 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3360 /// 3361 /// \param __p 3362 /// A pointer to a memory location that will receive the integer values. 3363 /// \param __a 3364 /// A 256-bit integer vector containing the values to be moved. 3365 static __inline void __DEFAULT_FN_ATTRS 3366 _mm256_storeu_si256(__m256i_u *__p, __m256i __a) 3367 { 3368 struct __storeu_si256 { 3369 __m256i_u __v; 3370 } __attribute__((__packed__, __may_alias__)); 3371 ((struct __storeu_si256*)__p)->__v = __a; 3372 } 3373 3374 /* Conditional load ops */ 3375 /// Conditionally loads double-precision floating point elements from a 3376 /// memory location pointed to by \a __p into a 128-bit vector of 3377 /// [2 x double], depending on the mask bits associated with each data 3378 /// element. 
3379 /// 3380 /// \headerfile <x86intrin.h> 3381 /// 3382 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3383 /// 3384 /// \param __p 3385 /// A pointer to a memory location that contains the double-precision 3386 /// floating point values. 3387 /// \param __m 3388 /// A 128-bit integer vector containing the mask. The most significant bit of 3389 /// each data element represents the mask bits. If a mask bit is zero, the 3390 /// corresponding value in the memory location is not loaded and the 3391 /// corresponding field in the return value is set to zero. 3392 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 3393 static __inline __m128d __DEFAULT_FN_ATTRS128 3394 _mm_maskload_pd(double const *__p, __m128i __m) 3395 { 3396 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3397 } 3398 3399 /// Conditionally loads double-precision floating point elements from a 3400 /// memory location pointed to by \a __p into a 256-bit vector of 3401 /// [4 x double], depending on the mask bits associated with each data 3402 /// element. 3403 /// 3404 /// \headerfile <x86intrin.h> 3405 /// 3406 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3407 /// 3408 /// \param __p 3409 /// A pointer to a memory location that contains the double-precision 3410 /// floating point values. 3411 /// \param __m 3412 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3413 /// significant bit of each quadword element represents the mask bits. If a 3414 /// mask bit is zero, the corresponding value in the memory location is not 3415 /// loaded and the corresponding field in the return value is set to zero. 3416 /// \returns A 256-bit vector of [4 x double] containing the loaded values. 
3417 static __inline __m256d __DEFAULT_FN_ATTRS 3418 _mm256_maskload_pd(double const *__p, __m256i __m) 3419 { 3420 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3421 (__v4di)__m); 3422 } 3423 3424 /// Conditionally loads single-precision floating point elements from a 3425 /// memory location pointed to by \a __p into a 128-bit vector of 3426 /// [4 x float], depending on the mask bits associated with each data 3427 /// element. 3428 /// 3429 /// \headerfile <x86intrin.h> 3430 /// 3431 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3432 /// 3433 /// \param __p 3434 /// A pointer to a memory location that contains the single-precision 3435 /// floating point values. 3436 /// \param __m 3437 /// A 128-bit integer vector containing the mask. The most significant bit of 3438 /// each data element represents the mask bits. If a mask bit is zero, the 3439 /// corresponding value in the memory location is not loaded and the 3440 /// corresponding field in the return value is set to zero. 3441 /// \returns A 128-bit vector of [4 x float] containing the loaded values. 3442 static __inline __m128 __DEFAULT_FN_ATTRS128 3443 _mm_maskload_ps(float const *__p, __m128i __m) 3444 { 3445 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3446 } 3447 3448 /// Conditionally loads single-precision floating point elements from a 3449 /// memory location pointed to by \a __p into a 256-bit vector of 3450 /// [8 x float], depending on the mask bits associated with each data 3451 /// element. 3452 /// 3453 /// \headerfile <x86intrin.h> 3454 /// 3455 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3456 /// 3457 /// \param __p 3458 /// A pointer to a memory location that contains the single-precision 3459 /// floating point values. 3460 /// \param __m 3461 /// A 256-bit integer vector of [8 x dword] containing the mask. 
The most 3462 /// significant bit of each dword element represents the mask bits. If a mask 3463 /// bit is zero, the corresponding value in the memory location is not loaded 3464 /// and the corresponding field in the return value is set to zero. 3465 /// \returns A 256-bit vector of [8 x float] containing the loaded values. 3466 static __inline __m256 __DEFAULT_FN_ATTRS 3467 _mm256_maskload_ps(float const *__p, __m256i __m) 3468 { 3469 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3470 } 3471 3472 /* Conditional store ops */ 3473 /// Moves single-precision floating point values from a 256-bit vector 3474 /// of [8 x float] to a memory location pointed to by \a __p, according to 3475 /// the specified mask. 3476 /// 3477 /// \headerfile <x86intrin.h> 3478 /// 3479 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3480 /// 3481 /// \param __p 3482 /// A pointer to a memory location that will receive the float values. 3483 /// \param __m 3484 /// A 256-bit integer vector of [8 x dword] containing the mask. The most 3485 /// significant bit of each dword element in the mask vector represents the 3486 /// mask bits. If a mask bit is zero, the corresponding value from vector 3487 /// \a __a is not stored and the corresponding field in the memory location 3488 /// pointed to by \a __p is not changed. 3489 /// \param __a 3490 /// A 256-bit vector of [8 x float] containing the values to be stored. 3491 static __inline void __DEFAULT_FN_ATTRS 3492 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3493 { 3494 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3495 } 3496 3497 /// Moves double-precision values from a 128-bit vector of [2 x double] 3498 /// to a memory location pointed to by \a __p, according to the specified 3499 /// mask. 3500 /// 3501 /// \headerfile <x86intrin.h> 3502 /// 3503 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 
3504 /// 3505 /// \param __p 3506 /// A pointer to a memory location that will receive the float values. 3507 /// \param __m 3508 /// A 128-bit integer vector containing the mask. The most significant bit of 3509 /// each field in the mask vector represents the mask bits. If a mask bit is 3510 /// zero, the corresponding value from vector \a __a is not stored and the 3511 /// corresponding field in the memory location pointed to by \a __p is not 3512 /// changed. 3513 /// \param __a 3514 /// A 128-bit vector of [2 x double] containing the values to be stored. 3515 static __inline void __DEFAULT_FN_ATTRS128 3516 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3517 { 3518 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3519 } 3520 3521 /// Moves double-precision values from a 256-bit vector of [4 x double] 3522 /// to a memory location pointed to by \a __p, according to the specified 3523 /// mask. 3524 /// 3525 /// \headerfile <x86intrin.h> 3526 /// 3527 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3528 /// 3529 /// \param __p 3530 /// A pointer to a memory location that will receive the float values. 3531 /// \param __m 3532 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3533 /// significant bit of each quadword element in the mask vector represents 3534 /// the mask bits. If a mask bit is zero, the corresponding value from vector 3535 /// __a is not stored and the corresponding field in the memory location 3536 /// pointed to by \a __p is not changed. 3537 /// \param __a 3538 /// A 256-bit vector of [4 x double] containing the values to be stored. 
3539 static __inline void __DEFAULT_FN_ATTRS 3540 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3541 { 3542 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3543 } 3544 3545 /// Moves single-precision floating point values from a 128-bit vector 3546 /// of [4 x float] to a memory location pointed to by \a __p, according to 3547 /// the specified mask. 3548 /// 3549 /// \headerfile <x86intrin.h> 3550 /// 3551 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3552 /// 3553 /// \param __p 3554 /// A pointer to a memory location that will receive the float values. 3555 /// \param __m 3556 /// A 128-bit integer vector containing the mask. The most significant bit of 3557 /// each field in the mask vector represents the mask bits. If a mask bit is 3558 /// zero, the corresponding value from vector __a is not stored and the 3559 /// corresponding field in the memory location pointed to by \a __p is not 3560 /// changed. 3561 /// \param __a 3562 /// A 128-bit vector of [4 x float] containing the values to be stored. 3563 static __inline void __DEFAULT_FN_ATTRS128 3564 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3565 { 3566 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3567 } 3568 3569 /* Cacheability support ops */ 3570 /// Moves integer data from a 256-bit integer vector to a 32-byte 3571 /// aligned memory location. To minimize caching, the data is flagged as 3572 /// non-temporal (unlikely to be used again soon). 3573 /// 3574 /// \headerfile <x86intrin.h> 3575 /// 3576 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3577 /// 3578 /// \param __a 3579 /// A pointer to a 32-byte aligned memory location that will receive the 3580 /// integer values. 3581 /// \param __b 3582 /// A 256-bit integer vector containing the values to be moved. 
3583 static __inline void __DEFAULT_FN_ATTRS 3584 _mm256_stream_si256(void *__a, __m256i __b) 3585 { 3586 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 3587 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); 3588 } 3589 3590 /// Moves double-precision values from a 256-bit vector of [4 x double] 3591 /// to a 32-byte aligned memory location. To minimize caching, the data is 3592 /// flagged as non-temporal (unlikely to be used again soon). 3593 /// 3594 /// \headerfile <x86intrin.h> 3595 /// 3596 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3597 /// 3598 /// \param __a 3599 /// A pointer to a 32-byte aligned memory location that will receive the 3600 /// double-precision floating-point values. 3601 /// \param __b 3602 /// A 256-bit vector of [4 x double] containing the values to be moved. 3603 static __inline void __DEFAULT_FN_ATTRS 3604 _mm256_stream_pd(void *__a, __m256d __b) 3605 { 3606 typedef __v4df __v4df_aligned __attribute__((aligned(32))); 3607 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); 3608 } 3609 3610 /// Moves single-precision floating point values from a 256-bit vector 3611 /// of [8 x float] to a 32-byte aligned memory location. To minimize 3612 /// caching, the data is flagged as non-temporal (unlikely to be used again 3613 /// soon). 3614 /// 3615 /// \headerfile <x86intrin.h> 3616 /// 3617 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3618 /// 3619 /// \param __p 3620 /// A pointer to a 32-byte aligned memory location that will receive the 3621 /// single-precision floating point values. 3622 /// \param __a 3623 /// A 256-bit vector of [8 x float] containing the values to be moved. 
3624 static __inline void __DEFAULT_FN_ATTRS 3625 _mm256_stream_ps(void *__p, __m256 __a) 3626 { 3627 typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); 3628 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); 3629 } 3630 3631 /* Create vectors */ 3632 /// Create a 256-bit vector of [4 x double] with undefined values. 3633 /// 3634 /// \headerfile <x86intrin.h> 3635 /// 3636 /// This intrinsic has no corresponding instruction. 3637 /// 3638 /// \returns A 256-bit vector of [4 x double] containing undefined values. 3639 static __inline__ __m256d __DEFAULT_FN_ATTRS 3640 _mm256_undefined_pd(void) 3641 { 3642 return (__m256d)__builtin_ia32_undef256(); 3643 } 3644 3645 /// Create a 256-bit vector of [8 x float] with undefined values. 3646 /// 3647 /// \headerfile <x86intrin.h> 3648 /// 3649 /// This intrinsic has no corresponding instruction. 3650 /// 3651 /// \returns A 256-bit vector of [8 x float] containing undefined values. 3652 static __inline__ __m256 __DEFAULT_FN_ATTRS 3653 _mm256_undefined_ps(void) 3654 { 3655 return (__m256)__builtin_ia32_undef256(); 3656 } 3657 3658 /// Create a 256-bit integer vector with undefined values. 3659 /// 3660 /// \headerfile <x86intrin.h> 3661 /// 3662 /// This intrinsic has no corresponding instruction. 3663 /// 3664 /// \returns A 256-bit integer vector containing undefined values. 3665 static __inline__ __m256i __DEFAULT_FN_ATTRS 3666 _mm256_undefined_si256(void) 3667 { 3668 return (__m256i)__builtin_ia32_undef256(); 3669 } 3670 3671 /// Constructs a 256-bit floating-point vector of [4 x double] 3672 /// initialized with the specified double-precision floating-point values. 3673 /// 3674 /// \headerfile <x86intrin.h> 3675 /// 3676 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3677 /// instruction. 3678 /// 3679 /// \param __a 3680 /// A double-precision floating-point value used to initialize bits [255:192] 3681 /// of the result. 
3682 /// \param __b 3683 /// A double-precision floating-point value used to initialize bits [191:128] 3684 /// of the result. 3685 /// \param __c 3686 /// A double-precision floating-point value used to initialize bits [127:64] 3687 /// of the result. 3688 /// \param __d 3689 /// A double-precision floating-point value used to initialize bits [63:0] 3690 /// of the result. 3691 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 3692 static __inline __m256d __DEFAULT_FN_ATTRS 3693 _mm256_set_pd(double __a, double __b, double __c, double __d) 3694 { 3695 return __extension__ (__m256d){ __d, __c, __b, __a }; 3696 } 3697 3698 /// Constructs a 256-bit floating-point vector of [8 x float] initialized 3699 /// with the specified single-precision floating-point values. 3700 /// 3701 /// \headerfile <x86intrin.h> 3702 /// 3703 /// This intrinsic is a utility function and does not correspond to a specific 3704 /// instruction. 3705 /// 3706 /// \param __a 3707 /// A single-precision floating-point value used to initialize bits [255:224] 3708 /// of the result. 3709 /// \param __b 3710 /// A single-precision floating-point value used to initialize bits [223:192] 3711 /// of the result. 3712 /// \param __c 3713 /// A single-precision floating-point value used to initialize bits [191:160] 3714 /// of the result. 3715 /// \param __d 3716 /// A single-precision floating-point value used to initialize bits [159:128] 3717 /// of the result. 3718 /// \param __e 3719 /// A single-precision floating-point value used to initialize bits [127:96] 3720 /// of the result. 3721 /// \param __f 3722 /// A single-precision floating-point value used to initialize bits [95:64] 3723 /// of the result. 3724 /// \param __g 3725 /// A single-precision floating-point value used to initialize bits [63:32] 3726 /// of the result. 3727 /// \param __h 3728 /// A single-precision floating-point value used to initialize bits [31:0] 3729 /// of the result. 
3730 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 3731 static __inline __m256 __DEFAULT_FN_ATTRS 3732 _mm256_set_ps(float __a, float __b, float __c, float __d, 3733 float __e, float __f, float __g, float __h) 3734 { 3735 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3736 } 3737 3738 /// Constructs a 256-bit integer vector initialized with the specified 3739 /// 32-bit integral values. 3740 /// 3741 /// \headerfile <x86intrin.h> 3742 /// 3743 /// This intrinsic is a utility function and does not correspond to a specific 3744 /// instruction. 3745 /// 3746 /// \param __i0 3747 /// A 32-bit integral value used to initialize bits [255:224] of the result. 3748 /// \param __i1 3749 /// A 32-bit integral value used to initialize bits [223:192] of the result. 3750 /// \param __i2 3751 /// A 32-bit integral value used to initialize bits [191:160] of the result. 3752 /// \param __i3 3753 /// A 32-bit integral value used to initialize bits [159:128] of the result. 3754 /// \param __i4 3755 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3756 /// \param __i5 3757 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3758 /// \param __i6 3759 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3760 /// \param __i7 3761 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3762 /// \returns An initialized 256-bit integer vector. 3763 static __inline __m256i __DEFAULT_FN_ATTRS 3764 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3765 int __i4, int __i5, int __i6, int __i7) 3766 { 3767 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3768 } 3769 3770 /// Constructs a 256-bit integer vector initialized with the specified 3771 /// 16-bit integral values. 
3772 /// 3773 /// \headerfile <x86intrin.h> 3774 /// 3775 /// This intrinsic is a utility function and does not correspond to a specific 3776 /// instruction. 3777 /// 3778 /// \param __w15 3779 /// A 16-bit integral value used to initialize bits [255:240] of the result. 3780 /// \param __w14 3781 /// A 16-bit integral value used to initialize bits [239:224] of the result. 3782 /// \param __w13 3783 /// A 16-bit integral value used to initialize bits [223:208] of the result. 3784 /// \param __w12 3785 /// A 16-bit integral value used to initialize bits [207:192] of the result. 3786 /// \param __w11 3787 /// A 16-bit integral value used to initialize bits [191:176] of the result. 3788 /// \param __w10 3789 /// A 16-bit integral value used to initialize bits [175:160] of the result. 3790 /// \param __w09 3791 /// A 16-bit integral value used to initialize bits [159:144] of the result. 3792 /// \param __w08 3793 /// A 16-bit integral value used to initialize bits [143:128] of the result. 3794 /// \param __w07 3795 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3796 /// \param __w06 3797 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3798 /// \param __w05 3799 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3800 /// \param __w04 3801 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3802 /// \param __w03 3803 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3804 /// \param __w02 3805 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3806 /// \param __w01 3807 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3808 /// \param __w00 3809 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3810 /// \returns An initialized 256-bit integer vector. 
3811 static __inline __m256i __DEFAULT_FN_ATTRS 3812 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3813 short __w11, short __w10, short __w09, short __w08, 3814 short __w07, short __w06, short __w05, short __w04, 3815 short __w03, short __w02, short __w01, short __w00) 3816 { 3817 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3818 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3819 } 3820 3821 /// Constructs a 256-bit integer vector initialized with the specified 3822 /// 8-bit integral values. 3823 /// 3824 /// \headerfile <x86intrin.h> 3825 /// 3826 /// This intrinsic is a utility function and does not correspond to a specific 3827 /// instruction. 3828 /// 3829 /// \param __b31 3830 /// An 8-bit integral value used to initialize bits [255:248] of the result. 3831 /// \param __b30 3832 /// An 8-bit integral value used to initialize bits [247:240] of the result. 3833 /// \param __b29 3834 /// An 8-bit integral value used to initialize bits [239:232] of the result. 3835 /// \param __b28 3836 /// An 8-bit integral value used to initialize bits [231:224] of the result. 3837 /// \param __b27 3838 /// An 8-bit integral value used to initialize bits [223:216] of the result. 3839 /// \param __b26 3840 /// An 8-bit integral value used to initialize bits [215:208] of the result. 3841 /// \param __b25 3842 /// An 8-bit integral value used to initialize bits [207:200] of the result. 3843 /// \param __b24 3844 /// An 8-bit integral value used to initialize bits [199:192] of the result. 3845 /// \param __b23 3846 /// An 8-bit integral value used to initialize bits [191:184] of the result. 3847 /// \param __b22 3848 /// An 8-bit integral value used to initialize bits [183:176] of the result. 3849 /// \param __b21 3850 /// An 8-bit integral value used to initialize bits [175:168] of the result. 
3851 /// \param __b20 3852 /// An 8-bit integral value used to initialize bits [167:160] of the result. 3853 /// \param __b19 3854 /// An 8-bit integral value used to initialize bits [159:152] of the result. 3855 /// \param __b18 3856 /// An 8-bit integral value used to initialize bits [151:144] of the result. 3857 /// \param __b17 3858 /// An 8-bit integral value used to initialize bits [143:136] of the result. 3859 /// \param __b16 3860 /// An 8-bit integral value used to initialize bits [135:128] of the result. 3861 /// \param __b15 3862 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3863 /// \param __b14 3864 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3865 /// \param __b13 3866 /// An 8-bit integral value used to initialize bits [111:104] of the result. 3867 /// \param __b12 3868 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3869 /// \param __b11 3870 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3871 /// \param __b10 3872 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3873 /// \param __b09 3874 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3875 /// \param __b08 3876 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3877 /// \param __b07 3878 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3879 /// \param __b06 3880 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3881 /// \param __b05 3882 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3883 /// \param __b04 3884 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3885 /// \param __b03 3886 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3887 /// \param __b02 3888 /// An 8-bit integral value used to initialize bits [23:16] of the result. 
3889 /// \param __b01 3890 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3891 /// \param __b00 3892 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3893 /// \returns An initialized 256-bit integer vector. 3894 static __inline __m256i __DEFAULT_FN_ATTRS 3895 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3896 char __b27, char __b26, char __b25, char __b24, 3897 char __b23, char __b22, char __b21, char __b20, 3898 char __b19, char __b18, char __b17, char __b16, 3899 char __b15, char __b14, char __b13, char __b12, 3900 char __b11, char __b10, char __b09, char __b08, 3901 char __b07, char __b06, char __b05, char __b04, 3902 char __b03, char __b02, char __b01, char __b00) 3903 { 3904 return __extension__ (__m256i)(__v32qi){ 3905 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3906 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3907 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3908 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3909 }; 3910 } 3911 3912 /// Constructs a 256-bit integer vector initialized with the specified 3913 /// 64-bit integral values. 3914 /// 3915 /// \headerfile <x86intrin.h> 3916 /// 3917 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3918 /// instruction. 3919 /// 3920 /// \param __a 3921 /// A 64-bit integral value used to initialize bits [255:192] of the result. 3922 /// \param __b 3923 /// A 64-bit integral value used to initialize bits [191:128] of the result. 3924 /// \param __c 3925 /// A 64-bit integral value used to initialize bits [127:64] of the result. 3926 /// \param __d 3927 /// A 64-bit integral value used to initialize bits [63:0] of the result. 3928 /// \returns An initialized 256-bit integer vector. 
3929 static __inline __m256i __DEFAULT_FN_ATTRS 3930 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3931 { 3932 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; 3933 } 3934 3935 /* Create vectors with elements in reverse order */ 3936 /// Constructs a 256-bit floating-point vector of [4 x double], 3937 /// initialized in reverse order with the specified double-precision 3938 /// floating-point values. 3939 /// 3940 /// \headerfile <x86intrin.h> 3941 /// 3942 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3943 /// instruction. 3944 /// 3945 /// \param __a 3946 /// A double-precision floating-point value used to initialize bits [63:0] 3947 /// of the result. 3948 /// \param __b 3949 /// A double-precision floating-point value used to initialize bits [127:64] 3950 /// of the result. 3951 /// \param __c 3952 /// A double-precision floating-point value used to initialize bits [191:128] 3953 /// of the result. 3954 /// \param __d 3955 /// A double-precision floating-point value used to initialize bits [255:192] 3956 /// of the result. 3957 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 3958 static __inline __m256d __DEFAULT_FN_ATTRS 3959 _mm256_setr_pd(double __a, double __b, double __c, double __d) 3960 { 3961 return _mm256_set_pd(__d, __c, __b, __a); 3962 } 3963 3964 /// Constructs a 256-bit floating-point vector of [8 x float], 3965 /// initialized in reverse order with the specified single-precision 3966 /// float-point values. 3967 /// 3968 /// \headerfile <x86intrin.h> 3969 /// 3970 /// This intrinsic is a utility function and does not correspond to a specific 3971 /// instruction. 3972 /// 3973 /// \param __a 3974 /// A single-precision floating-point value used to initialize bits [31:0] 3975 /// of the result. 3976 /// \param __b 3977 /// A single-precision floating-point value used to initialize bits [63:32] 3978 /// of the result. 
3979 /// \param __c 3980 /// A single-precision floating-point value used to initialize bits [95:64] 3981 /// of the result. 3982 /// \param __d 3983 /// A single-precision floating-point value used to initialize bits [127:96] 3984 /// of the result. 3985 /// \param __e 3986 /// A single-precision floating-point value used to initialize bits [159:128] 3987 /// of the result. 3988 /// \param __f 3989 /// A single-precision floating-point value used to initialize bits [191:160] 3990 /// of the result. 3991 /// \param __g 3992 /// A single-precision floating-point value used to initialize bits [223:192] 3993 /// of the result. 3994 /// \param __h 3995 /// A single-precision floating-point value used to initialize bits [255:224] 3996 /// of the result. 3997 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 3998 static __inline __m256 __DEFAULT_FN_ATTRS 3999 _mm256_setr_ps(float __a, float __b, float __c, float __d, 4000 float __e, float __f, float __g, float __h) 4001 { 4002 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); 4003 } 4004 4005 /// Constructs a 256-bit integer vector, initialized in reverse order 4006 /// with the specified 32-bit integral values. 4007 /// 4008 /// \headerfile <x86intrin.h> 4009 /// 4010 /// This intrinsic is a utility function and does not correspond to a specific 4011 /// instruction. 4012 /// 4013 /// \param __i0 4014 /// A 32-bit integral value used to initialize bits [31:0] of the result. 4015 /// \param __i1 4016 /// A 32-bit integral value used to initialize bits [63:32] of the result. 4017 /// \param __i2 4018 /// A 32-bit integral value used to initialize bits [95:64] of the result. 4019 /// \param __i3 4020 /// A 32-bit integral value used to initialize bits [127:96] of the result. 4021 /// \param __i4 4022 /// A 32-bit integral value used to initialize bits [159:128] of the result. 4023 /// \param __i5 4024 /// A 32-bit integral value used to initialize bits [191:160] of the result. 
4025 /// \param __i6 4026 /// A 32-bit integral value used to initialize bits [223:192] of the result. 4027 /// \param __i7 4028 /// A 32-bit integral value used to initialize bits [255:224] of the result. 4029 /// \returns An initialized 256-bit integer vector. 4030 static __inline __m256i __DEFAULT_FN_ATTRS 4031 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 4032 int __i4, int __i5, int __i6, int __i7) 4033 { 4034 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); 4035 } 4036 4037 /// Constructs a 256-bit integer vector, initialized in reverse order 4038 /// with the specified 16-bit integral values. 4039 /// 4040 /// \headerfile <x86intrin.h> 4041 /// 4042 /// This intrinsic is a utility function and does not correspond to a specific 4043 /// instruction. 4044 /// 4045 /// \param __w15 4046 /// A 16-bit integral value used to initialize bits [15:0] of the result. 4047 /// \param __w14 4048 /// A 16-bit integral value used to initialize bits [31:16] of the result. 4049 /// \param __w13 4050 /// A 16-bit integral value used to initialize bits [47:32] of the result. 4051 /// \param __w12 4052 /// A 16-bit integral value used to initialize bits [63:48] of the result. 4053 /// \param __w11 4054 /// A 16-bit integral value used to initialize bits [79:64] of the result. 4055 /// \param __w10 4056 /// A 16-bit integral value used to initialize bits [95:80] of the result. 4057 /// \param __w09 4058 /// A 16-bit integral value used to initialize bits [111:96] of the result. 4059 /// \param __w08 4060 /// A 16-bit integral value used to initialize bits [127:112] of the result. 4061 /// \param __w07 4062 /// A 16-bit integral value used to initialize bits [143:128] of the result. 4063 /// \param __w06 4064 /// A 16-bit integral value used to initialize bits [159:144] of the result. 4065 /// \param __w05 4066 /// A 16-bit integral value used to initialize bits [175:160] of the result. 
4067 /// \param __w04 4068 /// A 16-bit integral value used to initialize bits [191:176] of the result. 4069 /// \param __w03 4070 /// A 16-bit integral value used to initialize bits [207:192] of the result. 4071 /// \param __w02 4072 /// A 16-bit integral value used to initialize bits [223:208] of the result. 4073 /// \param __w01 4074 /// A 16-bit integral value used to initialize bits [239:224] of the result. 4075 /// \param __w00 4076 /// A 16-bit integral value used to initialize bits [255:240] of the result. 4077 /// \returns An initialized 256-bit integer vector. 4078 static __inline __m256i __DEFAULT_FN_ATTRS 4079 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4080 short __w11, short __w10, short __w09, short __w08, 4081 short __w07, short __w06, short __w05, short __w04, 4082 short __w03, short __w02, short __w01, short __w00) 4083 { 4084 return _mm256_set_epi16(__w00, __w01, __w02, __w03, 4085 __w04, __w05, __w06, __w07, 4086 __w08, __w09, __w10, __w11, 4087 __w12, __w13, __w14, __w15); 4088 } 4089 4090 /// Constructs a 256-bit integer vector, initialized in reverse order 4091 /// with the specified 8-bit integral values. 4092 /// 4093 /// \headerfile <x86intrin.h> 4094 /// 4095 /// This intrinsic is a utility function and does not correspond to a specific 4096 /// instruction. 4097 /// 4098 /// \param __b31 4099 /// An 8-bit integral value used to initialize bits [7:0] of the result. 4100 /// \param __b30 4101 /// An 8-bit integral value used to initialize bits [15:8] of the result. 4102 /// \param __b29 4103 /// An 8-bit integral value used to initialize bits [23:16] of the result. 4104 /// \param __b28 4105 /// An 8-bit integral value used to initialize bits [31:24] of the result. 4106 /// \param __b27 4107 /// An 8-bit integral value used to initialize bits [39:32] of the result. 4108 /// \param __b26 4109 /// An 8-bit integral value used to initialize bits [47:40] of the result. 
4110 /// \param __b25 4111 /// An 8-bit integral value used to initialize bits [55:48] of the result. 4112 /// \param __b24 4113 /// An 8-bit integral value used to initialize bits [63:56] of the result. 4114 /// \param __b23 4115 /// An 8-bit integral value used to initialize bits [71:64] of the result. 4116 /// \param __b22 4117 /// An 8-bit integral value used to initialize bits [79:72] of the result. 4118 /// \param __b21 4119 /// An 8-bit integral value used to initialize bits [87:80] of the result. 4120 /// \param __b20 4121 /// An 8-bit integral value used to initialize bits [95:88] of the result. 4122 /// \param __b19 4123 /// An 8-bit integral value used to initialize bits [103:96] of the result. 4124 /// \param __b18 4125 /// An 8-bit integral value used to initialize bits [111:104] of the result. 4126 /// \param __b17 4127 /// An 8-bit integral value used to initialize bits [119:112] of the result. 4128 /// \param __b16 4129 /// An 8-bit integral value used to initialize bits [127:120] of the result. 4130 /// \param __b15 4131 /// An 8-bit integral value used to initialize bits [135:128] of the result. 4132 /// \param __b14 4133 /// An 8-bit integral value used to initialize bits [143:136] of the result. 4134 /// \param __b13 4135 /// An 8-bit integral value used to initialize bits [151:144] of the result. 4136 /// \param __b12 4137 /// An 8-bit integral value used to initialize bits [159:152] of the result. 4138 /// \param __b11 4139 /// An 8-bit integral value used to initialize bits [167:160] of the result. 4140 /// \param __b10 4141 /// An 8-bit integral value used to initialize bits [175:168] of the result. 4142 /// \param __b09 4143 /// An 8-bit integral value used to initialize bits [183:176] of the result. 4144 /// \param __b08 4145 /// An 8-bit integral value used to initialize bits [191:184] of the result. 4146 /// \param __b07 4147 /// An 8-bit integral value used to initialize bits [199:192] of the result. 
4148 /// \param __b06 4149 /// An 8-bit integral value used to initialize bits [207:200] of the result. 4150 /// \param __b05 4151 /// An 8-bit integral value used to initialize bits [215:208] of the result. 4152 /// \param __b04 4153 /// An 8-bit integral value used to initialize bits [223:216] of the result. 4154 /// \param __b03 4155 /// An 8-bit integral value used to initialize bits [231:224] of the result. 4156 /// \param __b02 4157 /// An 8-bit integral value used to initialize bits [239:232] of the result. 4158 /// \param __b01 4159 /// An 8-bit integral value used to initialize bits [247:240] of the result. 4160 /// \param __b00 4161 /// An 8-bit integral value used to initialize bits [255:248] of the result. 4162 /// \returns An initialized 256-bit integer vector. 4163 static __inline __m256i __DEFAULT_FN_ATTRS 4164 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4165 char __b27, char __b26, char __b25, char __b24, 4166 char __b23, char __b22, char __b21, char __b20, 4167 char __b19, char __b18, char __b17, char __b16, 4168 char __b15, char __b14, char __b13, char __b12, 4169 char __b11, char __b10, char __b09, char __b08, 4170 char __b07, char __b06, char __b05, char __b04, 4171 char __b03, char __b02, char __b01, char __b00) 4172 { 4173 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 4174 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 4175 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 4176 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31); 4177 } 4178 4179 /// Constructs a 256-bit integer vector, initialized in reverse order 4180 /// with the specified 64-bit integral values. 4181 /// 4182 /// \headerfile <x86intrin.h> 4183 /// 4184 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4185 /// instruction. 4186 /// 4187 /// \param __a 4188 /// A 64-bit integral value used to initialize bits [63:0] of the result. 
4189 /// \param __b 4190 /// A 64-bit integral value used to initialize bits [127:64] of the result. 4191 /// \param __c 4192 /// A 64-bit integral value used to initialize bits [191:128] of the result. 4193 /// \param __d 4194 /// A 64-bit integral value used to initialize bits [255:192] of the result. 4195 /// \returns An initialized 256-bit integer vector. 4196 static __inline __m256i __DEFAULT_FN_ATTRS 4197 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4198 { 4199 return _mm256_set_epi64x(__d, __c, __b, __a); 4200 } 4201 4202 /* Create vectors with repeated elements */ 4203 /// Constructs a 256-bit floating-point vector of [4 x double], with each 4204 /// of the four double-precision floating-point vector elements set to the 4205 /// specified double-precision floating-point value. 4206 /// 4207 /// \headerfile <x86intrin.h> 4208 /// 4209 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4210 /// 4211 /// \param __w 4212 /// A double-precision floating-point value used to initialize each vector 4213 /// element of the result. 4214 /// \returns An initialized 256-bit floating-point vector of [4 x double]. 4215 static __inline __m256d __DEFAULT_FN_ATTRS 4216 _mm256_set1_pd(double __w) 4217 { 4218 return _mm256_set_pd(__w, __w, __w, __w); 4219 } 4220 4221 /// Constructs a 256-bit floating-point vector of [8 x float], with each 4222 /// of the eight single-precision floating-point vector elements set to the 4223 /// specified single-precision floating-point value. 4224 /// 4225 /// \headerfile <x86intrin.h> 4226 /// 4227 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4228 /// instruction. 4229 /// 4230 /// \param __w 4231 /// A single-precision floating-point value used to initialize each vector 4232 /// element of the result. 4233 /// \returns An initialized 256-bit floating-point vector of [8 x float]. 
4234 static __inline __m256 __DEFAULT_FN_ATTRS 4235 _mm256_set1_ps(float __w) 4236 { 4237 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); 4238 } 4239 4240 /// Constructs a 256-bit integer vector of [8 x i32], with each of the 4241 /// 32-bit integral vector elements set to the specified 32-bit integral 4242 /// value. 4243 /// 4244 /// \headerfile <x86intrin.h> 4245 /// 4246 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4247 /// instruction. 4248 /// 4249 /// \param __i 4250 /// A 32-bit integral value used to initialize each vector element of the 4251 /// result. 4252 /// \returns An initialized 256-bit integer vector of [8 x i32]. 4253 static __inline __m256i __DEFAULT_FN_ATTRS 4254 _mm256_set1_epi32(int __i) 4255 { 4256 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); 4257 } 4258 4259 /// Constructs a 256-bit integer vector of [16 x i16], with each of the 4260 /// 16-bit integral vector elements set to the specified 16-bit integral 4261 /// value. 4262 /// 4263 /// \headerfile <x86intrin.h> 4264 /// 4265 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4266 /// 4267 /// \param __w 4268 /// A 16-bit integral value used to initialize each vector element of the 4269 /// result. 4270 /// \returns An initialized 256-bit integer vector of [16 x i16]. 4271 static __inline __m256i __DEFAULT_FN_ATTRS 4272 _mm256_set1_epi16(short __w) 4273 { 4274 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w, 4275 __w, __w, __w, __w, __w, __w, __w, __w); 4276 } 4277 4278 /// Constructs a 256-bit integer vector of [32 x i8], with each of the 4279 /// 8-bit integral vector elements set to the specified 8-bit integral value. 4280 /// 4281 /// \headerfile <x86intrin.h> 4282 /// 4283 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4284 /// 4285 /// \param __b 4286 /// An 8-bit integral value used to initialize each vector element of the 4287 /// result. 
4288 /// \returns An initialized 256-bit integer vector of [32 x i8]. 4289 static __inline __m256i __DEFAULT_FN_ATTRS 4290 _mm256_set1_epi8(char __b) 4291 { 4292 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, 4293 __b, __b, __b, __b, __b, __b, __b, __b, 4294 __b, __b, __b, __b, __b, __b, __b, __b, 4295 __b, __b, __b, __b, __b, __b, __b, __b); 4296 } 4297 4298 /// Constructs a 256-bit integer vector of [4 x i64], with each of the 4299 /// 64-bit integral vector elements set to the specified 64-bit integral 4300 /// value. 4301 /// 4302 /// \headerfile <x86intrin.h> 4303 /// 4304 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4305 /// 4306 /// \param __q 4307 /// A 64-bit integral value used to initialize each vector element of the 4308 /// result. 4309 /// \returns An initialized 256-bit integer vector of [4 x i64]. 4310 static __inline __m256i __DEFAULT_FN_ATTRS 4311 _mm256_set1_epi64x(long long __q) 4312 { 4313 return _mm256_set_epi64x(__q, __q, __q, __q); 4314 } 4315 4316 /* Create __zeroed vectors */ 4317 /// Constructs a 256-bit floating-point vector of [4 x double] with all 4318 /// vector elements initialized to zero. 4319 /// 4320 /// \headerfile <x86intrin.h> 4321 /// 4322 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4323 /// 4324 /// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4325 static __inline __m256d __DEFAULT_FN_ATTRS 4326 _mm256_setzero_pd(void) 4327 { 4328 return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 }; 4329 } 4330 4331 /// Constructs a 256-bit floating-point vector of [8 x float] with all 4332 /// vector elements initialized to zero. 4333 /// 4334 /// \headerfile <x86intrin.h> 4335 /// 4336 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4337 /// 4338 /// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
4339 static __inline __m256 __DEFAULT_FN_ATTRS 4340 _mm256_setzero_ps(void) 4341 { 4342 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; 4343 } 4344 4345 /// Constructs a 256-bit integer vector initialized to zero. 4346 /// 4347 /// \headerfile <x86intrin.h> 4348 /// 4349 /// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4350 /// 4351 /// \returns A 256-bit integer vector initialized to zero. 4352 static __inline __m256i __DEFAULT_FN_ATTRS 4353 _mm256_setzero_si256(void) 4354 { 4355 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; 4356 } 4357 4358 /* Cast between vector types */ 4359 /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4360 /// floating-point vector of [8 x float]. 4361 /// 4362 /// \headerfile <x86intrin.h> 4363 /// 4364 /// This intrinsic has no corresponding instruction. 4365 /// 4366 /// \param __a 4367 /// A 256-bit floating-point vector of [4 x double]. 4368 /// \returns A 256-bit floating-point vector of [8 x float] containing the same 4369 /// bitwise pattern as the parameter. 4370 static __inline __m256 __DEFAULT_FN_ATTRS 4371 _mm256_castpd_ps(__m256d __a) 4372 { 4373 return (__m256)__a; 4374 } 4375 4376 /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4377 /// integer vector. 4378 /// 4379 /// \headerfile <x86intrin.h> 4380 /// 4381 /// This intrinsic has no corresponding instruction. 4382 /// 4383 /// \param __a 4384 /// A 256-bit floating-point vector of [4 x double]. 4385 /// \returns A 256-bit integer vector containing the same bitwise pattern as the 4386 /// parameter. 4387 static __inline __m256i __DEFAULT_FN_ATTRS 4388 _mm256_castpd_si256(__m256d __a) 4389 { 4390 return (__m256i)__a; 4391 } 4392 4393 /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4394 /// floating-point vector of [4 x double]. 4395 /// 4396 /// \headerfile <x86intrin.h> 4397 /// 4398 /// This intrinsic has no corresponding instruction. 
4399 /// 4400 /// \param __a 4401 /// A 256-bit floating-point vector of [8 x float]. 4402 /// \returns A 256-bit floating-point vector of [4 x double] containing the same 4403 /// bitwise pattern as the parameter. 4404 static __inline __m256d __DEFAULT_FN_ATTRS 4405 _mm256_castps_pd(__m256 __a) 4406 { 4407 return (__m256d)__a; 4408 } 4409 4410 /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4411 /// integer vector. 4412 /// 4413 /// \headerfile <x86intrin.h> 4414 /// 4415 /// This intrinsic has no corresponding instruction. 4416 /// 4417 /// \param __a 4418 /// A 256-bit floating-point vector of [8 x float]. 4419 /// \returns A 256-bit integer vector containing the same bitwise pattern as the 4420 /// parameter. 4421 static __inline __m256i __DEFAULT_FN_ATTRS 4422 _mm256_castps_si256(__m256 __a) 4423 { 4424 return (__m256i)__a; 4425 } 4426 4427 /// Casts a 256-bit integer vector into a 256-bit floating-point vector 4428 /// of [8 x float]. 4429 /// 4430 /// \headerfile <x86intrin.h> 4431 /// 4432 /// This intrinsic has no corresponding instruction. 4433 /// 4434 /// \param __a 4435 /// A 256-bit integer vector. 4436 /// \returns A 256-bit floating-point vector of [8 x float] containing the same 4437 /// bitwise pattern as the parameter. 4438 static __inline __m256 __DEFAULT_FN_ATTRS 4439 _mm256_castsi256_ps(__m256i __a) 4440 { 4441 return (__m256)__a; 4442 } 4443 4444 /// Casts a 256-bit integer vector into a 256-bit floating-point vector 4445 /// of [4 x double]. 4446 /// 4447 /// \headerfile <x86intrin.h> 4448 /// 4449 /// This intrinsic has no corresponding instruction. 4450 /// 4451 /// \param __a 4452 /// A 256-bit integer vector. 4453 /// \returns A 256-bit floating-point vector of [4 x double] containing the same 4454 /// bitwise pattern as the parameter. 
4455 static __inline __m256d __DEFAULT_FN_ATTRS 4456 _mm256_castsi256_pd(__m256i __a) 4457 { 4458 return (__m256d)__a; 4459 } 4460 4461 /// Returns the lower 128 bits of a 256-bit floating-point vector of 4462 /// [4 x double] as a 128-bit floating-point vector of [2 x double]. 4463 /// 4464 /// \headerfile <x86intrin.h> 4465 /// 4466 /// This intrinsic has no corresponding instruction. 4467 /// 4468 /// \param __a 4469 /// A 256-bit floating-point vector of [4 x double]. 4470 /// \returns A 128-bit floating-point vector of [2 x double] containing the 4471 /// lower 128 bits of the parameter. 4472 static __inline __m128d __DEFAULT_FN_ATTRS 4473 _mm256_castpd256_pd128(__m256d __a) 4474 { 4475 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4476 } 4477 4478 /// Returns the lower 128 bits of a 256-bit floating-point vector of 4479 /// [8 x float] as a 128-bit floating-point vector of [4 x float]. 4480 /// 4481 /// \headerfile <x86intrin.h> 4482 /// 4483 /// This intrinsic has no corresponding instruction. 4484 /// 4485 /// \param __a 4486 /// A 256-bit floating-point vector of [8 x float]. 4487 /// \returns A 128-bit floating-point vector of [4 x float] containing the 4488 /// lower 128 bits of the parameter. 4489 static __inline __m128 __DEFAULT_FN_ATTRS 4490 _mm256_castps256_ps128(__m256 __a) 4491 { 4492 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4493 } 4494 4495 /// Truncates a 256-bit integer vector into a 128-bit integer vector. 4496 /// 4497 /// \headerfile <x86intrin.h> 4498 /// 4499 /// This intrinsic has no corresponding instruction. 4500 /// 4501 /// \param __a 4502 /// A 256-bit integer vector. 4503 /// \returns A 128-bit integer vector containing the lower 128 bits of the 4504 /// parameter. 
4505 static __inline __m128i __DEFAULT_FN_ATTRS 4506 _mm256_castsi256_si128(__m256i __a) 4507 { 4508 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4509 } 4510 4511 /// Constructs a 256-bit floating-point vector of [4 x double] from a 4512 /// 128-bit floating-point vector of [2 x double]. 4513 /// 4514 /// The lower 128 bits contain the value of the source vector. The contents 4515 /// of the upper 128 bits are undefined. 4516 /// 4517 /// \headerfile <x86intrin.h> 4518 /// 4519 /// This intrinsic has no corresponding instruction. 4520 /// 4521 /// \param __a 4522 /// A 128-bit vector of [2 x double]. 4523 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4524 /// contain the value of the parameter. The contents of the upper 128 bits 4525 /// are undefined. 4526 static __inline __m256d __DEFAULT_FN_ATTRS 4527 _mm256_castpd128_pd256(__m128d __a) 4528 { 4529 return __builtin_shufflevector( 4530 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3); 4531 } 4532 4533 /// Constructs a 256-bit floating-point vector of [8 x float] from a 4534 /// 128-bit floating-point vector of [4 x float]. 4535 /// 4536 /// The lower 128 bits contain the value of the source vector. The contents 4537 /// of the upper 128 bits are undefined. 4538 /// 4539 /// \headerfile <x86intrin.h> 4540 /// 4541 /// This intrinsic has no corresponding instruction. 4542 /// 4543 /// \param __a 4544 /// A 128-bit vector of [4 x float]. 4545 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4546 /// contain the value of the parameter. The contents of the upper 128 bits 4547 /// are undefined. 4548 static __inline __m256 __DEFAULT_FN_ATTRS 4549 _mm256_castps128_ps256(__m128 __a) 4550 { 4551 return __builtin_shufflevector((__v4sf)__a, 4552 (__v4sf)__builtin_nondeterministic_value(__a), 4553 0, 1, 2, 3, 4, 5, 6, 7); 4554 } 4555 4556 /// Constructs a 256-bit integer vector from a 128-bit integer vector. 
4557 /// 4558 /// The lower 128 bits contain the value of the source vector. The contents 4559 /// of the upper 128 bits are undefined. 4560 /// 4561 /// \headerfile <x86intrin.h> 4562 /// 4563 /// This intrinsic has no corresponding instruction. 4564 /// 4565 /// \param __a 4566 /// A 128-bit integer vector. 4567 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4568 /// the parameter. The contents of the upper 128 bits are undefined. 4569 static __inline __m256i __DEFAULT_FN_ATTRS 4570 _mm256_castsi128_si256(__m128i __a) 4571 { 4572 return __builtin_shufflevector( 4573 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3); 4574 } 4575 4576 /// Constructs a 256-bit floating-point vector of [4 x double] from a 4577 /// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4578 /// contain the value of the source vector. The upper 128 bits are set 4579 /// to zero. 4580 /// 4581 /// \headerfile <x86intrin.h> 4582 /// 4583 /// This intrinsic has no corresponding instruction. 4584 /// 4585 /// \param __a 4586 /// A 128-bit vector of [2 x double]. 4587 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4588 /// contain the value of the parameter. The upper 128 bits are set to zero. 4589 static __inline __m256d __DEFAULT_FN_ATTRS 4590 _mm256_zextpd128_pd256(__m128d __a) 4591 { 4592 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); 4593 } 4594 4595 /// Constructs a 256-bit floating-point vector of [8 x float] from a 4596 /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4597 /// the value of the source vector. The upper 128 bits are set to zero. 4598 /// 4599 /// \headerfile <x86intrin.h> 4600 /// 4601 /// This intrinsic has no corresponding instruction. 4602 /// 4603 /// \param __a 4604 /// A 128-bit vector of [4 x float]. 4605 /// \returns A 256-bit floating-point vector of [8 x float]. 
The lower 128 bits 4606 /// contain the value of the parameter. The upper 128 bits are set to zero. 4607 static __inline __m256 __DEFAULT_FN_ATTRS 4608 _mm256_zextps128_ps256(__m128 __a) 4609 { 4610 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); 4611 } 4612 4613 /// Constructs a 256-bit integer vector from a 128-bit integer vector. 4614 /// The lower 128 bits contain the value of the source vector. The upper 4615 /// 128 bits are set to zero. 4616 /// 4617 /// \headerfile <x86intrin.h> 4618 /// 4619 /// This intrinsic has no corresponding instruction. 4620 /// 4621 /// \param __a 4622 /// A 128-bit integer vector. 4623 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4624 /// the parameter. The upper 128 bits are set to zero. 4625 static __inline __m256i __DEFAULT_FN_ATTRS 4626 _mm256_zextsi128_si256(__m128i __a) 4627 { 4628 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); 4629 } 4630 4631 /* 4632 Vector insert. 4633 We use macros rather than inlines because we only want to accept 4634 invocations where the immediate M is a constant expression. 4635 */ 4636 /// Constructs a new 256-bit vector of [8 x float] by first duplicating 4637 /// a 256-bit vector of [8 x float] given in the first parameter, and then 4638 /// replacing either the upper or the lower 128 bits with the contents of a 4639 /// 128-bit vector of [4 x float] in the second parameter. 4640 /// 4641 /// The immediate integer parameter determines between the upper or the lower 4642 /// 128 bits. 4643 /// 4644 /// \headerfile <x86intrin.h> 4645 /// 4646 /// \code 4647 /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4648 /// \endcode 4649 /// 4650 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4651 /// 4652 /// \param V1 4653 /// A 256-bit vector of [8 x float]. 
///    This vector is copied to the result first, and then either the upper or
///    the lower 128 bits of the result will be replaced by the contents of
///    \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))

/// Constructs a new 256-bit vector of [4 x double] by first duplicating
///    a 256-bit vector of [4 x double] given in the first parameter, and then
///    replacing either the upper or the lower 128 bits with the contents of a
///    128-bit vector of [2 x double] in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
///    This vector is copied to the result first, and then either the upper or
///    the lower 128 bits of the result will be replaced by the contents of
///    \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. The contents of this parameter are
///    written to either the upper or the lower 128 bits of the result depending
///    on the value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256( \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))

/// Constructs a new 256-bit integer vector by first duplicating a
///    256-bit integer vector given in the first parameter, and then replacing
///    either the upper or the lower 128 bits with the contents of a 128-bit
///    integer vector in the second parameter.
///
///    The immediate integer parameter selects between the upper and the lower
///    128 bits.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
///    This vector is copied to the result first, and then either the upper or
///    the lower 128 bits of the result will be replaced by the contents of
///    \a V2.
/// \param V2
///    A 128-bit integer vector. The contents of this parameter are written to
///    either the upper or the lower 128 bits of the result depending on the
///    value of parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the values
///    from the two parameters are interleaved: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256( \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))

/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer.
///    The least significant bit determines which bits are extracted from the
///    first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256( \
      (__v8sf)(__m256)(V), (int)(M)))

/// Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256( \
      (__v4df)(__m256d)(V), (int)(M)))

/// Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
4806 /// 4807 /// \headerfile <x86intrin.h> 4808 /// 4809 /// \code 4810 /// __m128i _mm256_extractf128_si256(__m256i V, const int M); 4811 /// \endcode 4812 /// 4813 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. 4814 /// 4815 /// \param V 4816 /// A 256-bit integer vector. 4817 /// \param M 4818 /// An immediate integer. The least significant bit determines which bits are 4819 /// extracted from the first parameter: \n 4820 /// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the 4821 /// result. \n 4822 /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 4823 /// \returns A 128-bit integer vector containing the extracted bits. 4824 #define _mm256_extractf128_si256(V, M) \ 4825 ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))) 4826 4827 /// Constructs a 256-bit floating-point vector of [8 x float] by 4828 /// concatenating two 128-bit floating-point vectors of [4 x float]. 4829 /// 4830 /// \headerfile <x86intrin.h> 4831 /// 4832 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4833 /// 4834 /// \param __hi 4835 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4836 /// 128 bits of the result. 4837 /// \param __lo 4838 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4839 /// 128 bits of the result. 4840 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4841 /// concatenated result. 4842 static __inline __m256 __DEFAULT_FN_ATTRS 4843 _mm256_set_m128 (__m128 __hi, __m128 __lo) 4844 { 4845 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 4846 } 4847 4848 /// Constructs a 256-bit floating-point vector of [4 x double] by 4849 /// concatenating two 128-bit floating-point vectors of [2 x double]. 4850 /// 4851 /// \headerfile <x86intrin.h> 4852 /// 4853 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 
4854 /// 4855 /// \param __hi 4856 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4857 /// 128 bits of the result. 4858 /// \param __lo 4859 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4860 /// 128 bits of the result. 4861 /// \returns A 256-bit floating-point vector of [4 x double] containing the 4862 /// concatenated result. 4863 static __inline __m256d __DEFAULT_FN_ATTRS 4864 _mm256_set_m128d (__m128d __hi, __m128d __lo) 4865 { 4866 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3); 4867 } 4868 4869 /// Constructs a 256-bit integer vector by concatenating two 128-bit 4870 /// integer vectors. 4871 /// 4872 /// \headerfile <x86intrin.h> 4873 /// 4874 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4875 /// 4876 /// \param __hi 4877 /// A 128-bit integer vector to be copied to the upper 128 bits of the 4878 /// result. 4879 /// \param __lo 4880 /// A 128-bit integer vector to be copied to the lower 128 bits of the 4881 /// result. 4882 /// \returns A 256-bit integer vector containing the concatenated result. 4883 static __inline __m256i __DEFAULT_FN_ATTRS 4884 _mm256_set_m128i (__m128i __hi, __m128i __lo) 4885 { 4886 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3); 4887 } 4888 4889 /// Constructs a 256-bit floating-point vector of [8 x float] by 4890 /// concatenating two 128-bit floating-point vectors of [4 x float]. This is 4891 /// similar to _mm256_set_m128, but the order of the input parameters is 4892 /// swapped. 4893 /// 4894 /// \headerfile <x86intrin.h> 4895 /// 4896 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4897 /// 4898 /// \param __lo 4899 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4900 /// 128 bits of the result. 
4901 /// \param __hi 4902 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4903 /// 128 bits of the result. 4904 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4905 /// concatenated result. 4906 static __inline __m256 __DEFAULT_FN_ATTRS 4907 _mm256_setr_m128 (__m128 __lo, __m128 __hi) 4908 { 4909 return _mm256_set_m128(__hi, __lo); 4910 } 4911 4912 /// Constructs a 256-bit floating-point vector of [4 x double] by 4913 /// concatenating two 128-bit floating-point vectors of [2 x double]. This is 4914 /// similar to _mm256_set_m128d, but the order of the input parameters is 4915 /// swapped. 4916 /// 4917 /// \headerfile <x86intrin.h> 4918 /// 4919 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4920 /// 4921 /// \param __lo 4922 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4923 /// 128 bits of the result. 4924 /// \param __hi 4925 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4926 /// 128 bits of the result. 4927 /// \returns A 256-bit floating-point vector of [4 x double] containing the 4928 /// concatenated result. 4929 static __inline __m256d __DEFAULT_FN_ATTRS 4930 _mm256_setr_m128d (__m128d __lo, __m128d __hi) 4931 { 4932 return (__m256d)_mm256_set_m128d(__hi, __lo); 4933 } 4934 4935 /// Constructs a 256-bit integer vector by concatenating two 128-bit 4936 /// integer vectors. This is similar to _mm256_set_m128i, but the order of 4937 /// the input parameters is swapped. 4938 /// 4939 /// \headerfile <x86intrin.h> 4940 /// 4941 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4942 /// 4943 /// \param __lo 4944 /// A 128-bit integer vector to be copied to the lower 128 bits of the 4945 /// result. 4946 /// \param __hi 4947 /// A 128-bit integer vector to be copied to the upper 128 bits of the 4948 /// result. 4949 /// \returns A 256-bit integer vector containing the concatenated result. 
4950 static __inline __m256i __DEFAULT_FN_ATTRS 4951 _mm256_setr_m128i (__m128i __lo, __m128i __hi) 4952 { 4953 return (__m256i)_mm256_set_m128i(__hi, __lo); 4954 } 4955 4956 /* SIMD load ops (unaligned) */ 4957 /// Loads two 128-bit floating-point vectors of [4 x float] from 4958 /// unaligned memory locations and constructs a 256-bit floating-point vector 4959 /// of [8 x float] by concatenating the two 128-bit vectors. 4960 /// 4961 /// \headerfile <x86intrin.h> 4962 /// 4963 /// This intrinsic corresponds to load instructions followed by the 4964 /// <c> VINSERTF128 </c> instruction. 4965 /// 4966 /// \param __addr_hi 4967 /// A pointer to a 128-bit memory location containing 4 consecutive 4968 /// single-precision floating-point values. These values are to be copied to 4969 /// bits[255:128] of the result. The address of the memory location does not 4970 /// have to be aligned. 4971 /// \param __addr_lo 4972 /// A pointer to a 128-bit memory location containing 4 consecutive 4973 /// single-precision floating-point values. These values are to be copied to 4974 /// bits[127:0] of the result. The address of the memory location does not 4975 /// have to be aligned. 4976 /// \returns A 256-bit floating-point vector of [8 x float] containing the 4977 /// concatenated result. 4978 static __inline __m256 __DEFAULT_FN_ATTRS 4979 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4980 { 4981 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); 4982 } 4983 4984 /// Loads two 128-bit floating-point vectors of [2 x double] from 4985 /// unaligned memory locations and constructs a 256-bit floating-point vector 4986 /// of [4 x double] by concatenating the two 128-bit vectors. 4987 /// 4988 /// \headerfile <x86intrin.h> 4989 /// 4990 /// This intrinsic corresponds to load instructions followed by the 4991 /// <c> VINSERTF128 </c> instruction. 
4992 /// 4993 /// \param __addr_hi 4994 /// A pointer to a 128-bit memory location containing two consecutive 4995 /// double-precision floating-point values. These values are to be copied to 4996 /// bits[255:128] of the result. The address of the memory location does not 4997 /// have to be aligned. 4998 /// \param __addr_lo 4999 /// A pointer to a 128-bit memory location containing two consecutive 5000 /// double-precision floating-point values. These values are to be copied to 5001 /// bits[127:0] of the result. The address of the memory location does not 5002 /// have to be aligned. 5003 /// \returns A 256-bit floating-point vector of [4 x double] containing the 5004 /// concatenated result. 5005 static __inline __m256d __DEFAULT_FN_ATTRS 5006 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 5007 { 5008 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); 5009 } 5010 5011 /// Loads two 128-bit integer vectors from unaligned memory locations and 5012 /// constructs a 256-bit integer vector by concatenating the two 128-bit 5013 /// vectors. 5014 /// 5015 /// \headerfile <x86intrin.h> 5016 /// 5017 /// This intrinsic corresponds to load instructions followed by the 5018 /// <c> VINSERTF128 </c> instruction. 5019 /// 5020 /// \param __addr_hi 5021 /// A pointer to a 128-bit memory location containing a 128-bit integer 5022 /// vector. This vector is to be copied to bits[255:128] of the result. The 5023 /// address of the memory location does not have to be aligned. 5024 /// \param __addr_lo 5025 /// A pointer to a 128-bit memory location containing a 128-bit integer 5026 /// vector. This vector is to be copied to bits[127:0] of the result. The 5027 /// address of the memory location does not have to be aligned. 5028 /// \returns A 256-bit integer vector containing the concatenated result. 
5029 static __inline __m256i __DEFAULT_FN_ATTRS 5030 _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) 5031 { 5032 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo)); 5033 } 5034 5035 /* SIMD store ops (unaligned) */ 5036 /// Stores the upper and lower 128 bits of a 256-bit floating-point 5037 /// vector of [8 x float] into two different unaligned memory locations. 5038 /// 5039 /// \headerfile <x86intrin.h> 5040 /// 5041 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5042 /// store instructions. 5043 /// 5044 /// \param __addr_hi 5045 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5046 /// copied to this memory location. The address of this memory location does 5047 /// not have to be aligned. 5048 /// \param __addr_lo 5049 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5050 /// copied to this memory location. The address of this memory location does 5051 /// not have to be aligned. 5052 /// \param __a 5053 /// A 256-bit floating-point vector of [8 x float]. 5054 static __inline void __DEFAULT_FN_ATTRS 5055 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 5056 { 5057 __m128 __v128; 5058 5059 __v128 = _mm256_castps256_ps128(__a); 5060 _mm_storeu_ps(__addr_lo, __v128); 5061 __v128 = _mm256_extractf128_ps(__a, 1); 5062 _mm_storeu_ps(__addr_hi, __v128); 5063 } 5064 5065 /// Stores the upper and lower 128 bits of a 256-bit floating-point 5066 /// vector of [4 x double] into two different unaligned memory locations. 5067 /// 5068 /// \headerfile <x86intrin.h> 5069 /// 5070 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5071 /// store instructions. 5072 /// 5073 /// \param __addr_hi 5074 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5075 /// copied to this memory location. 
The address of this memory location does 5076 /// not have to be aligned. 5077 /// \param __addr_lo 5078 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5079 /// copied to this memory location. The address of this memory location does 5080 /// not have to be aligned. 5081 /// \param __a 5082 /// A 256-bit floating-point vector of [4 x double]. 5083 static __inline void __DEFAULT_FN_ATTRS 5084 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 5085 { 5086 __m128d __v128; 5087 5088 __v128 = _mm256_castpd256_pd128(__a); 5089 _mm_storeu_pd(__addr_lo, __v128); 5090 __v128 = _mm256_extractf128_pd(__a, 1); 5091 _mm_storeu_pd(__addr_hi, __v128); 5092 } 5093 5094 /// Stores the upper and lower 128 bits of a 256-bit integer vector into 5095 /// two different unaligned memory locations. 5096 /// 5097 /// \headerfile <x86intrin.h> 5098 /// 5099 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 5100 /// store instructions. 5101 /// 5102 /// \param __addr_hi 5103 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 5104 /// copied to this memory location. The address of this memory location does 5105 /// not have to be aligned. 5106 /// \param __addr_lo 5107 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 5108 /// copied to this memory location. The address of this memory location does 5109 /// not have to be aligned. 5110 /// \param __a 5111 /// A 256-bit integer vector. 5112 static __inline void __DEFAULT_FN_ATTRS 5113 _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) 5114 { 5115 __m128i __v128; 5116 5117 __v128 = _mm256_castsi256_si128(__a); 5118 _mm_storeu_si128(__addr_lo, __v128); 5119 __v128 = _mm256_extractf128_si256(__a, 1); 5120 _mm_storeu_si128(__addr_hi, __v128); 5121 } 5122 5123 #undef __DEFAULT_FN_ATTRS 5124 #undef __DEFAULT_FN_ATTRS128 5125 5126 #endif /* __AVXINTRIN_H */ 5127