/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX2INTRIN_H
#define __AVX2INTRIN_H

/* Define the default attributes for the functions in this file.
 * Both macros carry the same always-inline, nodebug and
 * target("avx2,no-evex512") attributes; they differ only in the value given
 * to __min_vector_width__ (256 vs. 128). */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))

/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
///    \a Y.
///
///    Eight SAD results are computed using the lower half of the input
///    vectors, and another eight using the upper half. These 16-bit values
///    are returned in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    A single SAD operation selects four bytes from \a X and four bytes from
///    \a Y as input. It computes the differences between each \a X byte and
///    the corresponding \a Y byte, takes the absolute value of each
///    difference, and sums these four values to form one 16-bit result. The
///    intrinsic computes 16 of these results with different sets of input
///    bytes.
///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by \a M[1:0] times 32.
///    The eight operations use successive
///    sets of four bytes from \a X; the starting bit position for the first
///    set of four bytes is specified by \a M[2] times 32. These bit positions
///    are all relative to the 128-bit lane for each set of eight operations.
///
///    The bit fields of \a M named in this description are the ones used for
///    the lower 128-bit lane; for the upper lane the operation below reads
///    \a M[4:3] and \a M[5] instead.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///     An unsigned immediate value specifying the starting positions of the
///     bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), (int)(M)))

/// Computes the absolute value of each signed byte in the 256-bit integer
///    vector \a __a and returns each value in the corresponding byte of
///    the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPABSB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi8(__m256i __a)
{
  /* Cast to __v32qs rather than __v32qi so the bytes are explicitly signed;
     as noted in _mm256_cmpgt_epi8 in this file, __v32qi is a plain char
     vector whose signedness may vary. */
  return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
}

/// Computes the absolute value of each signed 16-bit element in the 256-bit
///    vector of [16 x i16] in \a __a and returns each value in the
///    corresponding element of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPABSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi16(__m256i __a)
{
  return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
}

/// Computes the absolute value of each signed 32-bit element in the 256-bit
///    vector of [8 x i32] in \a __a and returns each value in the
///    corresponding element of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPABSD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi32(__m256i __a)
{
  return (__m256i)__builtin_elementwise_abs((__v8si)__a);
}

/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
///    integers using signed saturation, and returns the 256-bit result.
///
///    In the operation below, SATURATE8 clamps a value to the signed 8-bit
///    range [-128, 127].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   k := i*8
///   result[7+k:k] := SATURATE8(__a[15+j:j])
///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKSSWB instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
///    result[191:128].
/// \param __b
///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
}

/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
///    integers using signed saturation, and returns the resulting 256-bit
///    vector of [16 x i16].
///
///    In the operation below, SATURATE16 clamps a value to the signed 16-bit
///    range [-32768, 32767].
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := i*16
///   result[15+k:k] := SATURATE16(__a[31+j:j])
///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKSSDW instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
///    result[191:128].
/// \param __b
///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packs_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
}

/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
///    using unsigned saturation, and returns the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   k := i*8
///   result[7+k:k] := SATURATE8U(__a[15+j:j])
///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
/// ENDFOR
/// \endcode
///
/// In the operation above, SATURATE8U clamps a value to the unsigned 8-bit
/// range [0, 255].
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKUSWB instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
///    result[191:128].
/// \param __b
///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
}

/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
///    using unsigned saturation, and returns the resulting 256-bit vector of
///    [16 x i16].
///
///    In the operation below, SATURATE16U clamps a value to the unsigned
///    16-bit range [0, 65535].
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := i*16
///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPACKUSDW instruction.
///
/// \param __V1
///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
///    result[191:128].
/// \param __V2
///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
///    result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_packus_epi32(__m256i __V1, __m256i __V2)
{
  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
}

/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
///    vectors and returns the lower 8 bits of each sum in the corresponding
///    byte of the 256-bit integer vector result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the source operands.
/// \param __b
///    A 256-bit integer vector containing one of the source operands.
/// \returns A 256-bit integer vector containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi8(__m256i __a, __m256i __b)
{
  /* NOTE(review): __v32qu is declared outside this file; presumably its
     elements are unsigned, so that the wraparound described above is well
     defined -- confirm against its typedef. */
  return (__m256i)((__v32qu)__a + (__v32qu)__b);
}

/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
///    [16 x i16] and returns the lower 16 bits of each sum in the
///    corresponding element of the [16 x i16] result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hu)__a + (__v16hu)__b);
}

/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
///    element of the [8 x i32] result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8su)__a + (__v8su)__b);
}

/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
///    element of the [4 x i64] result (overflow is ignored).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDQ instruction.
///
/// \param __a
///    A 256-bit vector of [4 x i64] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [4 x i64] containing one of the source operands.
/// \returns A 256-bit vector of [4 x i64] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_add_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4du)__a + (__v4du)__b);
}

/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
///    vectors using signed saturation, and returns each sum in the
///    corresponding byte of the 256-bit integer vector result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDSB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the source operands.
/// \param __b
///    A 256-bit integer vector containing one of the source operands.
/// \returns A 256-bit integer vector containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi8(__m256i __a, __m256i __b)
{
  /* Signed saturation, so the operands are cast to the explicitly signed
     byte vector type __v32qs rather than the plain-char __v32qi. */
  return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
}

/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
}

/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
///    vectors using unsigned saturation, and returns each sum in the
///    corresponding byte of the 256-bit integer vector result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDUSB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the source operands.
/// \param __b
///    A 256-bit integer vector containing one of the source operands.
/// \returns A 256-bit integer vector containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
}

/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPADDUSW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
}

/// Uses the lower half of the 256-bit vector \a a as the upper half of a
///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
///    as the lower half of the temporary value. Right-shifts the temporary
///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
///    \a b to make another temporary value, right shifts by \a n, and uses
///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
///    result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
/* NOTE(review): unlike _mm256_mpsadbw_epu8 and _mm256_blend_epi16 in this
   file, the immediate is forwarded as (n) with no (int) cast. */
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                      (__v32qi)(__m256i)(b), (n)))

/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
///    \a __b.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPAND instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_and_si256(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4du)__a & (__v4du)__b);
}

/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
///    the bitwise NOT of the 256-bit integer vector in \a __a.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPANDN instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_andnot_si256(__m256i __a, __m256i __b)
{
  /* Operand order matters: __a is the operand that is inverted. */
  return (__m256i)(~(__v4du)__a & (__v4du)__b);
}

/// Computes the averages of the corresponding unsigned bytes in the two
///    256-bit integer vectors in \a __a and \a __b and returns each
///    average in the corresponding byte of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPAVGB instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \param __b
///    A 256-bit integer vector.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
}

/// Computes the averages of the corresponding unsigned 16-bit integers in
///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
///    each average in the corresponding element of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPAVGW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16].
/// \param __b
///    A 256-bit vector of [16 x i16].
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_avg_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
}

/// Merges 8-bit integer values from either of the two 256-bit vectors
///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
///    the resulting 256-bit integer vector.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   IF __M[7+j] == 0
///     result[7+j:j] := __V1[7+j:j]
///   ELSE
///     result[7+j:j] := __V2[7+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPBLENDVB instruction.
///
/// \param __V1
///    A 256-bit integer vector containing source values.
/// \param __V2
///    A 256-bit integer vector containing source values.
/// \param __M
///    A 256-bit integer vector, with bit [7] of each byte specifying the
///    source for each corresponding byte of the result. When the mask bit
///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
///    \a __V2.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
{
  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
                                              (__v32qi)__M);
}

/// Merges 16-bit integer values from either of the two 256-bit vectors
///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
///    and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), (int)(M)))

/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
///    \a __b for equality and returns the outcomes in the corresponding
///    bytes of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the inputs.
/// \param __b
///    A 256-bit integer vector containing one of the inputs.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
{
  /* Equality does not depend on element signedness, so the plain-char
     __v32qi type is used here (contrast with _mm256_cmpgt_epi8, which
     casts to __v32qs). */
  return (__m256i)((__v32qi)__a == (__v32qi)__b);
}

/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
///    \a __a and \a __b for equality and returns the outcomes in the
///    corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hi)__a == (__v16hi)__b);
}

/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
///    \a __a and \a __b for equality and returns the outcomes in the
///    corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8si)__a == (__v8si)__b);
}

/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
///    \a __a and \a __b for equality and returns the outcomes in the
///    corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
///
/// \param __a
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4di)__a == (__v4di)__b);
}

/// Compares corresponding signed bytes in the 256-bit integer vectors in
///    \a __a and \a __b for greater-than and returns the outcomes in the
///    corresponding bytes of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 31
///   j := i*8
///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTB instruction.
///
/// \param __a
///    A 256-bit integer vector containing one of the inputs.
/// \param __b
///    A 256-bit integer vector containing one of the inputs.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
{
  /* This function always performs a signed comparison, but __v32qi is a char
     which may be signed or unsigned, so use __v32qs. */
  return (__m256i)((__v32qs)__a > (__v32qs)__b);
}

/// Compares corresponding signed elements in the 256-bit vectors of
///    [16 x i16] in \a __a and \a __b for greater-than and returns the
///    outcomes in the corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 15
///   j := i*16
///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the inputs.
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hi)__a > (__v16hi)__b);
}

/// Compares corresponding signed elements in the 256-bit vectors of
///    [8 x i32] in \a __a and \a __b for greater-than and returns the
///    outcomes in the corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTD instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [8 x i32] containing one of the inputs.
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8si)__a > (__v8si)__b);
}

/// Compares corresponding signed elements in the 256-bit vectors of
///    [4 x i64] in \a __a and \a __b for greater-than and returns the
///    outcomes in the corresponding elements of the 256-bit result.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
///
/// \param __a
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \param __b
///    A 256-bit vector of [4 x i64] containing one of the inputs.
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)((__v4di)__a > (__v4di)__b);
}

/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
///    element of the [16 x i16] result (overflow is ignored). Sums from
///    \a __a are returned in the lower 64 bits of each 128-bit half of the
///    result; sums from \a __b are returned in the upper 64 bits of each
///    128-bit half of the result.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPHADDW instruction.
///
/// \param __a
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \param __b
///    A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
}

/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
///    element of the [8 x i32] result (overflow is ignored).
Sums from \a __a 856 /// are returned in the lower 64 bits of each 128-bit half of the result; 857 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half 858 /// of the result. 859 /// 860 /// \code{.operation} 861 /// FOR i := 0 TO 1 862 /// j := i*128 863 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32] 864 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96] 865 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32] 866 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96] 867 /// ENDFOR 868 /// \endcode 869 /// 870 /// \headerfile <immintrin.h> 871 /// 872 /// This intrinsic corresponds to the \c VPHADDD instruction. 873 /// 874 /// \param __a 875 /// A 256-bit vector of [8 x i32] containing one of the source operands. 876 /// \param __b 877 /// A 256-bit vector of [8 x i32] containing one of the source operands. 878 /// \returns A 256-bit vector of [8 x i32] containing the sums. 879 static __inline__ __m256i __DEFAULT_FN_ATTRS256 880 _mm256_hadd_epi32(__m256i __a, __m256i __b) 881 { 882 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); 883 } 884 885 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit 886 /// vectors of [16 x i16] using signed saturation and returns each sum in 887 /// an element of the [16 x i16] result. Sums from \a __a are returned in 888 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b 889 /// are returned in the upper 64 bits of each 128-bit half of the result. 
890 /// 891 /// \code{.operation} 892 /// FOR i := 0 TO 1 893 /// j := i*128 894 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16]) 895 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48]) 896 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80]) 897 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112]) 898 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16]) 899 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48]) 900 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80]) 901 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112]) 902 /// ENDFOR 903 /// \endcode 904 /// 905 /// \headerfile <immintrin.h> 906 /// 907 /// This intrinsic corresponds to the \c VPHADDSW instruction. 908 /// 909 /// \param __a 910 /// A 256-bit vector of [16 x i16] containing one of the source operands. 911 /// \param __b 912 /// A 256-bit vector of [16 x i16] containing one of the source operands. 913 /// \returns A 256-bit vector of [16 x i16] containing the sums. 914 static __inline__ __m256i __DEFAULT_FN_ATTRS256 915 _mm256_hadds_epi16(__m256i __a, __m256i __b) 916 { 917 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); 918 } 919 920 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 921 /// vectors of [16 x i16] and returns the lower 16 bits of each difference 922 /// in an element of the [16 x i16] result (overflow is ignored). 923 /// Differences from \a __a are returned in the lower 64 bits of each 924 /// 128-bit half of the result; differences from \a __b are returned in the 925 /// upper 64 bits of each 128-bit half of the result. 
926 /// 927 /// \code{.operation} 928 /// FOR i := 0 TO 1 929 /// j := i*128 930 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16] 931 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48] 932 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80] 933 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112] 934 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16] 935 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48] 936 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80] 937 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112] 938 /// ENDFOR 939 /// \endcode 940 /// 941 /// \headerfile <immintrin.h> 942 /// 943 /// This intrinsic corresponds to the \c VPHSUBW instruction. 944 /// 945 /// \param __a 946 /// A 256-bit vector of [16 x i16] containing one of the source operands. 947 /// \param __b 948 /// A 256-bit vector of [16 x i16] containing one of the source operands. 949 /// \returns A 256-bit vector of [16 x i16] containing the differences. 950 static __inline__ __m256i __DEFAULT_FN_ATTRS256 951 _mm256_hsub_epi16(__m256i __a, __m256i __b) 952 { 953 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); 954 } 955 956 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit 957 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in 958 /// an element of the [8 x i32] result (overflow is ignored). Differences 959 /// from \a __a are returned in the lower 64 bits of each 128-bit half of 960 /// the result; differences from \a __b are returned in the upper 64 bits 961 /// of each 128-bit half of the result. 
962 /// 963 /// \code{.operation} 964 /// FOR i := 0 TO 1 965 /// j := i*128 966 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32] 967 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96] 968 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32] 969 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96] 970 /// ENDFOR 971 /// \endcode 972 /// 973 /// \headerfile <immintrin.h> 974 /// 975 /// This intrinsic corresponds to the \c VPHSUBD instruction. 976 /// 977 /// \param __a 978 /// A 256-bit vector of [8 x i32] containing one of the source operands. 979 /// \param __b 980 /// A 256-bit vector of [8 x i32] containing one of the source operands. 981 /// \returns A 256-bit vector of [8 x i32] containing the differences. 982 static __inline__ __m256i __DEFAULT_FN_ATTRS256 983 _mm256_hsub_epi32(__m256i __a, __m256i __b) 984 { 985 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); 986 } 987 988 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 989 /// vectors of [16 x i16] using signed saturation and returns each sum in 990 /// an element of the [16 x i16] result. Differences from \a __a are 991 /// returned in the lower 64 bits of each 128-bit half of the result; 992 /// differences from \a __b are returned in the upper 64 bits of each 993 /// 128-bit half of the result. 
994 /// 995 /// \code{.operation} 996 /// FOR i := 0 TO 1 997 /// j := i*128 998 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16]) 999 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48]) 1000 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80]) 1001 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112]) 1002 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16]) 1003 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48]) 1004 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80]) 1005 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112]) 1006 /// ENDFOR 1007 /// \endcode 1008 /// 1009 /// \headerfile <immintrin.h> 1010 /// 1011 /// This intrinsic corresponds to the \c VPHSUBSW instruction. 1012 /// 1013 /// \param __a 1014 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1015 /// \param __b 1016 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1017 /// \returns A 256-bit vector of [16 x i16] containing the differences. 1018 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1019 _mm256_hsubs_epi16(__m256i __a, __m256i __b) 1020 { 1021 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); 1022 } 1023 1024 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a 1025 /// with the corresponding signed byte from the 256-bit integer vector in 1026 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent 1027 /// pairs of those products using signed saturation to form 16-bit sums 1028 /// returned as elements of the [16 x i16] result. 
1029 /// 1030 /// \code{.operation} 1031 /// FOR i := 0 TO 15 1032 /// j := i*16 1033 /// temp1 := __a[j+7:j] * __b[j+7:j] 1034 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8] 1035 /// result[j+15:j] := SATURATE16(temp1 + temp2) 1036 /// ENDFOR 1037 /// \endcode 1038 /// 1039 /// \headerfile <immintrin.h> 1040 /// 1041 /// This intrinsic corresponds to the \c VPMADDUBSW instruction. 1042 /// 1043 /// \param __a 1044 /// A 256-bit vector containing one of the source operands. 1045 /// \param __b 1046 /// A 256-bit vector containing one of the source operands. 1047 /// \returns A 256-bit vector of [16 x i16] containing the result. 1048 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1049 _mm256_maddubs_epi16(__m256i __a, __m256i __b) 1050 { 1051 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); 1052 } 1053 1054 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of 1055 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of 1056 /// those products to form 32-bit sums returned as elements of the 1057 /// [8 x i32] result. 1058 /// 1059 /// There is only one wraparound case: when all four of the 16-bit sources 1060 /// are \c 0x8000, the result will be \c 0x80000000. 1061 /// 1062 /// \code{.operation} 1063 /// FOR i := 0 TO 7 1064 /// j := i*32 1065 /// temp1 := __a[j+15:j] * __b[j+15:j] 1066 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16] 1067 /// result[j+31:j] := temp1 + temp2 1068 /// ENDFOR 1069 /// \endcode 1070 /// 1071 /// \headerfile <immintrin.h> 1072 /// 1073 /// This intrinsic corresponds to the \c VPMADDWD instruction. 1074 /// 1075 /// \param __a 1076 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1077 /// \param __b 1078 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1079 /// \returns A 256-bit vector of [8 x i32] containing the result. 
1080 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1081 _mm256_madd_epi16(__m256i __a, __m256i __b) 1082 { 1083 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); 1084 } 1085 1086 /// Compares the corresponding signed bytes in the two 256-bit integer vectors 1087 /// in \a __a and \a __b and returns the larger of each pair in the 1088 /// corresponding byte of the 256-bit result. 1089 /// 1090 /// \headerfile <immintrin.h> 1091 /// 1092 /// This intrinsic corresponds to the \c VPMAXSB instruction. 1093 /// 1094 /// \param __a 1095 /// A 256-bit integer vector. 1096 /// \param __b 1097 /// A 256-bit integer vector. 1098 /// \returns A 256-bit integer vector containing the result. 1099 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1100 _mm256_max_epi8(__m256i __a, __m256i __b) 1101 { 1102 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b); 1103 } 1104 1105 /// Compares the corresponding signed 16-bit integers in the two 256-bit 1106 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 1107 /// each pair in the corresponding element of the 256-bit result. 1108 /// 1109 /// \headerfile <immintrin.h> 1110 /// 1111 /// This intrinsic corresponds to the \c VPMAXSW instruction. 1112 /// 1113 /// \param __a 1114 /// A 256-bit vector of [16 x i16]. 1115 /// \param __b 1116 /// A 256-bit vector of [16 x i16]. 1117 /// \returns A 256-bit vector of [16 x i16] containing the result. 1118 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1119 _mm256_max_epi16(__m256i __a, __m256i __b) 1120 { 1121 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b); 1122 } 1123 1124 /// Compares the corresponding signed 32-bit integers in the two 256-bit 1125 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 1126 /// each pair in the corresponding element of the 256-bit result. 1127 /// 1128 /// \headerfile <immintrin.h> 1129 /// 1130 /// This intrinsic corresponds to the \c VPMAXSD instruction. 
1131 /// 1132 /// \param __a 1133 /// A 256-bit vector of [8 x i32]. 1134 /// \param __b 1135 /// A 256-bit vector of [8 x i32]. 1136 /// \returns A 256-bit vector of [8 x i32] containing the result. 1137 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1138 _mm256_max_epi32(__m256i __a, __m256i __b) 1139 { 1140 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b); 1141 } 1142 1143 /// Compares the corresponding unsigned bytes in the two 256-bit integer 1144 /// vectors in \a __a and \a __b and returns the larger of each pair in 1145 /// the corresponding byte of the 256-bit result. 1146 /// 1147 /// \headerfile <immintrin.h> 1148 /// 1149 /// This intrinsic corresponds to the \c VPMAXUB instruction. 1150 /// 1151 /// \param __a 1152 /// A 256-bit integer vector. 1153 /// \param __b 1154 /// A 256-bit integer vector. 1155 /// \returns A 256-bit integer vector containing the result. 1156 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1157 _mm256_max_epu8(__m256i __a, __m256i __b) 1158 { 1159 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b); 1160 } 1161 1162 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit 1163 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 1164 /// each pair in the corresponding element of the 256-bit result. 1165 /// 1166 /// \headerfile <immintrin.h> 1167 /// 1168 /// This intrinsic corresponds to the \c VPMAXUW instruction. 1169 /// 1170 /// \param __a 1171 /// A 256-bit vector of [16 x i16]. 1172 /// \param __b 1173 /// A 256-bit vector of [16 x i16]. 1174 /// \returns A 256-bit vector of [16 x i16] containing the result. 
1175 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1176 _mm256_max_epu16(__m256i __a, __m256i __b) 1177 { 1178 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b); 1179 } 1180 1181 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit 1182 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 1183 /// each pair in the corresponding element of the 256-bit result. 1184 /// 1185 /// \headerfile <immintrin.h> 1186 /// 1187 /// This intrinsic corresponds to the \c VPMAXUD instruction. 1188 /// 1189 /// \param __a 1190 /// A 256-bit vector of [8 x i32]. 1191 /// \param __b 1192 /// A 256-bit vector of [8 x i32]. 1193 /// \returns A 256-bit vector of [8 x i32] containing the result. 1194 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1195 _mm256_max_epu32(__m256i __a, __m256i __b) 1196 { 1197 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b); 1198 } 1199 1200 /// Compares the corresponding signed bytes in the two 256-bit integer vectors 1201 /// in \a __a and \a __b and returns the smaller of each pair in the 1202 /// corresponding byte of the 256-bit result. 1203 /// 1204 /// \headerfile <immintrin.h> 1205 /// 1206 /// This intrinsic corresponds to the \c VPMINSB instruction. 1207 /// 1208 /// \param __a 1209 /// A 256-bit integer vector. 1210 /// \param __b 1211 /// A 256-bit integer vector. 1212 /// \returns A 256-bit integer vector containing the result. 1213 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1214 _mm256_min_epi8(__m256i __a, __m256i __b) 1215 { 1216 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b); 1217 } 1218 1219 /// Compares the corresponding signed 16-bit integers in the two 256-bit 1220 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 1221 /// each pair in the corresponding element of the 256-bit result. 1222 /// 1223 /// \headerfile <immintrin.h> 1224 /// 1225 /// This intrinsic corresponds to the \c VPMINSW instruction. 
1226 /// 1227 /// \param __a 1228 /// A 256-bit vector of [16 x i16]. 1229 /// \param __b 1230 /// A 256-bit vector of [16 x i16]. 1231 /// \returns A 256-bit vector of [16 x i16] containing the result. 1232 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1233 _mm256_min_epi16(__m256i __a, __m256i __b) 1234 { 1235 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b); 1236 } 1237 1238 /// Compares the corresponding signed 32-bit integers in the two 256-bit 1239 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 1240 /// each pair in the corresponding element of the 256-bit result. 1241 /// 1242 /// \headerfile <immintrin.h> 1243 /// 1244 /// This intrinsic corresponds to the \c VPMINSD instruction. 1245 /// 1246 /// \param __a 1247 /// A 256-bit vector of [8 x i32]. 1248 /// \param __b 1249 /// A 256-bit vector of [8 x i32]. 1250 /// \returns A 256-bit vector of [8 x i32] containing the result. 1251 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1252 _mm256_min_epi32(__m256i __a, __m256i __b) 1253 { 1254 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b); 1255 } 1256 1257 /// Compares the corresponding unsigned bytes in the two 256-bit integer 1258 /// vectors in \a __a and \a __b and returns the smaller of each pair in 1259 /// the corresponding byte of the 256-bit result. 1260 /// 1261 /// \headerfile <immintrin.h> 1262 /// 1263 /// This intrinsic corresponds to the \c VPMINUB instruction. 1264 /// 1265 /// \param __a 1266 /// A 256-bit integer vector. 1267 /// \param __b 1268 /// A 256-bit integer vector. 1269 /// \returns A 256-bit integer vector containing the result. 
1270 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1271 _mm256_min_epu8(__m256i __a, __m256i __b) 1272 { 1273 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b); 1274 } 1275 1276 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit 1277 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 1278 /// each pair in the corresponding element of the 256-bit result. 1279 /// 1280 /// \headerfile <immintrin.h> 1281 /// 1282 /// This intrinsic corresponds to the \c VPMINUW instruction. 1283 /// 1284 /// \param __a 1285 /// A 256-bit vector of [16 x i16]. 1286 /// \param __b 1287 /// A 256-bit vector of [16 x i16]. 1288 /// \returns A 256-bit vector of [16 x i16] containing the result. 1289 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1290 _mm256_min_epu16(__m256i __a, __m256i __b) 1291 { 1292 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b); 1293 } 1294 1295 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit 1296 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 1297 /// each pair in the corresponding element of the 256-bit result. 1298 /// 1299 /// \headerfile <immintrin.h> 1300 /// 1301 /// This intrinsic corresponds to the \c VPMINUD instruction. 1302 /// 1303 /// \param __a 1304 /// A 256-bit vector of [8 x i32]. 1305 /// \param __b 1306 /// A 256-bit vector of [8 x i32]. 1307 /// \returns A 256-bit vector of [8 x i32] containing the result. 1308 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1309 _mm256_min_epu32(__m256i __a, __m256i __b) 1310 { 1311 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b); 1312 } 1313 1314 /// Creates a 32-bit integer mask from the most significant bit of each byte 1315 /// in the 256-bit integer vector in \a __a and returns the result. 
1316 /// 1317 /// \code{.operation} 1318 /// FOR i := 0 TO 31 1319 /// j := i*8 1320 /// result[i] := __a[j+7] 1321 /// ENDFOR 1322 /// \endcode 1323 /// 1324 /// \headerfile <immintrin.h> 1325 /// 1326 /// This intrinsic corresponds to the \c VPMOVMSKB instruction. 1327 /// 1328 /// \param __a 1329 /// A 256-bit integer vector containing the source bytes. 1330 /// \returns The 32-bit integer mask. 1331 static __inline__ int __DEFAULT_FN_ATTRS256 1332 _mm256_movemask_epi8(__m256i __a) 1333 { 1334 return __builtin_ia32_pmovmskb256((__v32qi)__a); 1335 } 1336 1337 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns 1338 /// the 16-bit values in the corresponding elements of a 256-bit vector 1339 /// of [16 x i16]. 1340 /// 1341 /// \code{.operation} 1342 /// FOR i := 0 TO 15 1343 /// j := i*8 1344 /// k := i*16 1345 /// result[k+15:k] := SignExtend(__V[j+7:j]) 1346 /// ENDFOR 1347 /// \endcode 1348 /// 1349 /// \headerfile <immintrin.h> 1350 /// 1351 /// This intrinsic corresponds to the \c VPMOVSXBW instruction. 1352 /// 1353 /// \param __V 1354 /// A 128-bit integer vector containing the source bytes. 1355 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended 1356 /// values. 1357 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1358 _mm256_cvtepi8_epi16(__m128i __V) 1359 { 1360 /* This function always performs a signed extension, but __v16qi is a char 1361 which may be signed or unsigned, so use __v16qs. */ 1362 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); 1363 } 1364 1365 /// Sign-extends bytes from the lower half of the 128-bit integer vector in 1366 /// \a __V and returns the 32-bit values in the corresponding elements of a 1367 /// 256-bit vector of [8 x i32]. 
1368 /// 1369 /// \code{.operation} 1370 /// FOR i := 0 TO 7 1371 /// j := i*8 1372 /// k := i*32 1373 /// result[k+31:k] := SignExtend(__V[j+7:j]) 1374 /// ENDFOR 1375 /// \endcode 1376 /// 1377 /// \headerfile <immintrin.h> 1378 /// 1379 /// This intrinsic corresponds to the \c VPMOVSXBD instruction. 1380 /// 1381 /// \param __V 1382 /// A 128-bit integer vector containing the source bytes. 1383 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended 1384 /// values. 1385 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1386 _mm256_cvtepi8_epi32(__m128i __V) 1387 { 1388 /* This function always performs a signed extension, but __v16qi is a char 1389 which may be signed or unsigned, so use __v16qs. */ 1390 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 1391 } 1392 1393 /// Sign-extends the first four bytes from the 128-bit integer vector in 1394 /// \a __V and returns the 64-bit values in the corresponding elements of a 1395 /// 256-bit vector of [4 x i64]. 1396 /// 1397 /// \code{.operation} 1398 /// result[63:0] := SignExtend(__V[7:0]) 1399 /// result[127:64] := SignExtend(__V[15:8]) 1400 /// result[191:128] := SignExtend(__V[23:16]) 1401 /// result[255:192] := SignExtend(__V[31:24]) 1402 /// \endcode 1403 /// 1404 /// \headerfile <immintrin.h> 1405 /// 1406 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction. 1407 /// 1408 /// \param __V 1409 /// A 128-bit integer vector containing the source bytes. 1410 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1411 /// values. 1412 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1413 _mm256_cvtepi8_epi64(__m128i __V) 1414 { 1415 /* This function always performs a signed extension, but __v16qi is a char 1416 which may be signed or unsigned, so use __v16qs. 
*/ 1417 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); 1418 } 1419 1420 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in 1421 /// \a __V and returns the 32-bit values in the corresponding elements of a 1422 /// 256-bit vector of [8 x i32]. 1423 /// 1424 /// \code{.operation} 1425 /// FOR i := 0 TO 7 1426 /// j := i*16 1427 /// k := i*32 1428 /// result[k+31:k] := SignExtend(__V[j+15:j]) 1429 /// ENDFOR 1430 /// \endcode 1431 /// 1432 /// \headerfile <immintrin.h> 1433 /// 1434 /// This intrinsic corresponds to the \c VPMOVSXWD instruction. 1435 /// 1436 /// \param __V 1437 /// A 128-bit vector of [8 x i16] containing the source values. 1438 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended 1439 /// values. 1440 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1441 _mm256_cvtepi16_epi32(__m128i __V) 1442 { 1443 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); 1444 } 1445 1446 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of 1447 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 1448 /// elements of a 256-bit vector of [4 x i64]. 1449 /// 1450 /// \code{.operation} 1451 /// result[63:0] := SignExtend(__V[15:0]) 1452 /// result[127:64] := SignExtend(__V[31:16]) 1453 /// result[191:128] := SignExtend(__V[47:32]) 1454 /// result[255:192] := SignExtend(__V[64:48]) 1455 /// \endcode 1456 /// 1457 /// \headerfile <immintrin.h> 1458 /// 1459 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction. 1460 /// 1461 /// \param __V 1462 /// A 128-bit vector of [8 x i16] containing the source values. 1463 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1464 /// values. 
1465 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1466 _mm256_cvtepi16_epi64(__m128i __V) 1467 { 1468 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); 1469 } 1470 1471 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in 1472 /// \a __V and returns the 64-bit values in the corresponding elements of a 1473 /// 256-bit vector of [4 x i64]. 1474 /// 1475 /// \code{.operation} 1476 /// result[63:0] := SignExtend(__V[31:0]) 1477 /// result[127:64] := SignExtend(__V[63:32]) 1478 /// result[191:128] := SignExtend(__V[95:64]) 1479 /// result[255:192] := SignExtend(__V[127:96]) 1480 /// \endcode 1481 /// 1482 /// \headerfile <immintrin.h> 1483 /// 1484 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction. 1485 /// 1486 /// \param __V 1487 /// A 128-bit vector of [4 x i32] containing the source values. 1488 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1489 /// values. 1490 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1491 _mm256_cvtepi32_epi64(__m128i __V) 1492 { 1493 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); 1494 } 1495 1496 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns 1497 /// the 16-bit values in the corresponding elements of a 256-bit vector 1498 /// of [16 x i16]. 1499 /// 1500 /// \code{.operation} 1501 /// FOR i := 0 TO 15 1502 /// j := i*8 1503 /// k := i*16 1504 /// result[k+15:k] := ZeroExtend(__V[j+7:j]) 1505 /// ENDFOR 1506 /// \endcode 1507 /// 1508 /// \headerfile <immintrin.h> 1509 /// 1510 /// This intrinsic corresponds to the \c VPMOVZXBW instruction. 1511 /// 1512 /// \param __V 1513 /// A 128-bit integer vector containing the source bytes. 1514 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended 1515 /// values. 
1516 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1517 _mm256_cvtepu8_epi16(__m128i __V) 1518 { 1519 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); 1520 } 1521 1522 /// Zero-extends bytes from the lower half of the 128-bit integer vector in 1523 /// \a __V and returns the 32-bit values in the corresponding elements of a 1524 /// 256-bit vector of [8 x i32]. 1525 /// 1526 /// \code{.operation} 1527 /// FOR i := 0 TO 7 1528 /// j := i*8 1529 /// k := i*32 1530 /// result[k+31:k] := ZeroExtend(__V[j+7:j]) 1531 /// ENDFOR 1532 /// \endcode 1533 /// 1534 /// \headerfile <immintrin.h> 1535 /// 1536 /// This intrinsic corresponds to the \c VPMOVZXBD instruction. 1537 /// 1538 /// \param __V 1539 /// A 128-bit integer vector containing the source bytes. 1540 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended 1541 /// values. 1542 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1543 _mm256_cvtepu8_epi32(__m128i __V) 1544 { 1545 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 1546 } 1547 1548 /// Zero-extends the first four bytes from the 128-bit integer vector in 1549 /// \a __V and returns the 64-bit values in the corresponding elements of a 1550 /// 256-bit vector of [4 x i64]. 1551 /// 1552 /// \code{.operation} 1553 /// result[63:0] := ZeroExtend(__V[7:0]) 1554 /// result[127:64] := ZeroExtend(__V[15:8]) 1555 /// result[191:128] := ZeroExtend(__V[23:16]) 1556 /// result[255:192] := ZeroExtend(__V[31:24]) 1557 /// \endcode 1558 /// 1559 /// \headerfile <immintrin.h> 1560 /// 1561 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction. 1562 /// 1563 /// \param __V 1564 /// A 128-bit integer vector containing the source bytes. 1565 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1566 /// values. 
1567 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1568 _mm256_cvtepu8_epi64(__m128i __V) 1569 { 1570 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); 1571 } 1572 1573 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in 1574 /// \a __V and returns the 32-bit values in the corresponding elements of a 1575 /// 256-bit vector of [8 x i32]. 1576 /// 1577 /// \code{.operation} 1578 /// FOR i := 0 TO 7 1579 /// j := i*16 1580 /// k := i*32 1581 /// result[k+31:k] := ZeroExtend(__V[j+15:j]) 1582 /// ENDFOR 1583 /// \endcode 1584 /// 1585 /// \headerfile <immintrin.h> 1586 /// 1587 /// This intrinsic corresponds to the \c VPMOVZXWD instruction. 1588 /// 1589 /// \param __V 1590 /// A 128-bit vector of [8 x i16] containing the source values. 1591 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended 1592 /// values. 1593 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1594 _mm256_cvtepu16_epi32(__m128i __V) 1595 { 1596 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); 1597 } 1598 1599 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of 1600 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 1601 /// elements of a 256-bit vector of [4 x i64]. 1602 /// 1603 /// \code{.operation} 1604 /// result[63:0] := ZeroExtend(__V[15:0]) 1605 /// result[127:64] := ZeroExtend(__V[31:16]) 1606 /// result[191:128] := ZeroExtend(__V[47:32]) 1607 /// result[255:192] := ZeroExtend(__V[64:48]) 1608 /// \endcode 1609 /// 1610 /// \headerfile <immintrin.h> 1611 /// 1612 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction. 1613 /// 1614 /// \param __V 1615 /// A 128-bit vector of [8 x i16] containing the source values. 1616 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1617 /// values. 
1618 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1619 _mm256_cvtepu16_epi64(__m128i __V) 1620 { 1621 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); 1622 } 1623 1624 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in 1625 /// \a __V and returns the 64-bit values in the corresponding elements of a 1626 /// 256-bit vector of [4 x i64]. 1627 /// 1628 /// \code{.operation} 1629 /// result[63:0] := ZeroExtend(__V[31:0]) 1630 /// result[127:64] := ZeroExtend(__V[63:32]) 1631 /// result[191:128] := ZeroExtend(__V[95:64]) 1632 /// result[255:192] := ZeroExtend(__V[127:96]) 1633 /// \endcode 1634 /// 1635 /// \headerfile <immintrin.h> 1636 /// 1637 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction. 1638 /// 1639 /// \param __V 1640 /// A 128-bit vector of [4 x i32] containing the source values. 1641 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1642 /// values. 1643 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1644 _mm256_cvtepu32_epi64(__m128i __V) 1645 { 1646 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); 1647 } 1648 1649 /// Multiplies signed 32-bit integers from even-numbered elements of two 1650 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 1651 /// [4 x i64] result. 1652 /// 1653 /// \code{.operation} 1654 /// result[63:0] := __a[31:0] * __b[31:0] 1655 /// result[127:64] := __a[95:64] * __b[95:64] 1656 /// result[191:128] := __a[159:128] * __b[159:128] 1657 /// result[255:192] := __a[223:192] * __b[223:192] 1658 /// \endcode 1659 /// 1660 /// \headerfile <immintrin.h> 1661 /// 1662 /// This intrinsic corresponds to the \c VPMULDQ instruction. 1663 /// 1664 /// \param __a 1665 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1666 /// \param __b 1667 /// A 256-bit vector of [8 x i32] containing one of the source operands. 
1668 /// \returns A 256-bit vector of [4 x i64] containing the products. 1669 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1670 _mm256_mul_epi32(__m256i __a, __m256i __b) 1671 { 1672 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); 1673 } 1674 1675 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1676 /// [16 x i16], truncates the 32-bit results to the most significant 18 1677 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded 1678 /// product in the [16 x i16] result. 1679 /// 1680 /// \code{.operation} 1681 /// FOR i := 0 TO 15 1682 /// j := i*16 1683 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1 1684 /// result[j+15:j] := temp[16:1] 1685 /// \endcode 1686 /// 1687 /// \headerfile <immintrin.h> 1688 /// 1689 /// This intrinsic corresponds to the \c VPMULHRSW instruction. 1690 /// 1691 /// \param __a 1692 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1693 /// \param __b 1694 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1695 /// \returns A 256-bit vector of [16 x i16] containing the rounded products. 1696 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1697 _mm256_mulhrs_epi16(__m256i __a, __m256i __b) 1698 { 1699 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); 1700 } 1701 1702 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of 1703 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 1704 /// [16 x i16] result. 1705 /// 1706 /// \headerfile <immintrin.h> 1707 /// 1708 /// This intrinsic corresponds to the \c VPMULHUW instruction. 1709 /// 1710 /// \param __a 1711 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1712 /// \param __b 1713 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1714 /// \returns A 256-bit vector of [16 x i16] containing the products. 
1715 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1716 _mm256_mulhi_epu16(__m256i __a, __m256i __b) 1717 { 1718 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); 1719 } 1720 1721 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1722 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 1723 /// [16 x i16] result. 1724 /// 1725 /// \headerfile <immintrin.h> 1726 /// 1727 /// This intrinsic corresponds to the \c VPMULHW instruction. 1728 /// 1729 /// \param __a 1730 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1731 /// \param __b 1732 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1733 /// \returns A 256-bit vector of [16 x i16] containing the products. 1734 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1735 _mm256_mulhi_epi16(__m256i __a, __m256i __b) 1736 { 1737 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); 1738 } 1739 1740 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1741 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the 1742 /// [16 x i16] result. 1743 /// 1744 /// \headerfile <immintrin.h> 1745 /// 1746 /// This intrinsic corresponds to the \c VPMULLW instruction. 1747 /// 1748 /// \param __a 1749 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1750 /// \param __b 1751 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1752 /// \returns A 256-bit vector of [16 x i16] containing the products. 1753 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1754 _mm256_mullo_epi16(__m256i __a, __m256i __b) 1755 { 1756 return (__m256i)((__v16hu)__a * (__v16hu)__b); 1757 } 1758 1759 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of 1760 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the 1761 /// [8 x i32] result. 
1762 /// 1763 /// \headerfile <immintrin.h> 1764 /// 1765 /// This intrinsic corresponds to the \c VPMULLD instruction. 1766 /// 1767 /// \param __a 1768 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1769 /// \param __b 1770 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1771 /// \returns A 256-bit vector of [8 x i32] containing the products. 1772 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1773 _mm256_mullo_epi32 (__m256i __a, __m256i __b) 1774 { 1775 return (__m256i)((__v8su)__a * (__v8su)__b); 1776 } 1777 1778 /// Multiplies unsigned 32-bit integers from even-numered elements of two 1779 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 1780 /// [4 x i64] result. 1781 /// 1782 /// \code{.operation} 1783 /// result[63:0] := __a[31:0] * __b[31:0] 1784 /// result[127:64] := __a[95:64] * __b[95:64] 1785 /// result[191:128] := __a[159:128] * __b[159:128] 1786 /// result[255:192] := __a[223:192] * __b[223:192] 1787 /// \endcode 1788 /// 1789 /// \headerfile <immintrin.h> 1790 /// 1791 /// This intrinsic corresponds to the \c VPMULUDQ instruction. 1792 /// 1793 /// \param __a 1794 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1795 /// \param __b 1796 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1797 /// \returns A 256-bit vector of [4 x i64] containing the products. 1798 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1799 _mm256_mul_epu32(__m256i __a, __m256i __b) 1800 { 1801 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); 1802 } 1803 1804 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and 1805 /// \a __b. 1806 /// 1807 /// \headerfile <immintrin.h> 1808 /// 1809 /// This intrinsic corresponds to the \c VPOR instruction. 1810 /// 1811 /// \param __a 1812 /// A 256-bit integer vector. 1813 /// \param __b 1814 /// A 256-bit integer vector. 1815 /// \returns A 256-bit integer vector containing the result. 
1816 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1817 _mm256_or_si256(__m256i __a, __m256i __b) 1818 { 1819 return (__m256i)((__v4du)__a | (__v4du)__b); 1820 } 1821 1822 /// Computes four sum of absolute difference (SAD) operations on sets of eight 1823 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and 1824 /// \a __b. 1825 /// 1826 /// One SAD result is computed for each set of eight bytes from \a __a and 1827 /// eight bytes from \a __b. The zero-extended SAD value is returned in the 1828 /// corresponding 64-bit element of the result. 1829 /// 1830 /// A single SAD operation takes the differences between the corresponding 1831 /// bytes of \a __a and \a __b, takes the absolute value of each difference, 1832 /// and sums these eight values to form one 16-bit result. This operation 1833 /// is repeated four times with successive sets of eight bytes. 1834 /// 1835 /// \code{.operation} 1836 /// FOR i := 0 TO 3 1837 /// j := i*64 1838 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j]) 1839 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8]) 1840 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16]) 1841 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24]) 1842 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32]) 1843 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40]) 1844 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48]) 1845 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56]) 1846 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 + 1847 /// temp4 + temp5 + temp6 + temp7 1848 /// result[j+63:j+16] := 0 1849 /// ENDFOR 1850 /// \endcode 1851 /// 1852 /// \headerfile <immintrin.h> 1853 /// 1854 /// This intrinsic corresponds to the \c VPSADBW instruction. 1855 /// 1856 /// \param __a 1857 /// A 256-bit integer vector. 1858 /// \param __b 1859 /// A 256-bit integer vector. 1860 /// \returns A 256-bit integer vector containing the result. 
1861 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1862 _mm256_sad_epu8(__m256i __a, __m256i __b) 1863 { 1864 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); 1865 } 1866 1867 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according 1868 /// to control information in the 256-bit integer vector \a __b, and 1869 /// returns the 256-bit result. In effect there are two separate 128-bit 1870 /// shuffles in the lower and upper halves. 1871 /// 1872 /// \code{.operation} 1873 /// FOR i := 0 TO 31 1874 /// j := i*8 1875 /// IF __b[j+7] == 1 1876 /// result[j+7:j] := 0 1877 /// ELSE 1878 /// k := __b[j+3:j] * 8 1879 /// IF i > 15 1880 /// k := k + 128 1881 /// FI 1882 /// result[j+7:j] := __a[k+7:k] 1883 /// FI 1884 /// ENDFOR 1885 /// \endcode 1886 /// 1887 /// \headerfile <immintrin.h> 1888 /// 1889 /// This intrinsic corresponds to the \c VPSHUFB instruction. 1890 /// 1891 /// \param __a 1892 /// A 256-bit integer vector containing source values. 1893 /// \param __b 1894 /// A 256-bit integer vector containing control information to determine 1895 /// what goes into the corresponding byte of the result. If bit 7 of the 1896 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the 1897 /// control byte specify the index (within the same 128-bit half) of \a __a 1898 /// to copy to the result byte. 1899 /// \returns A 256-bit integer vector containing the result. 1900 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1901 _mm256_shuffle_epi8(__m256i __a, __m256i __b) 1902 { 1903 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); 1904 } 1905 1906 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a 1907 /// according to control information in the integer literal \a imm, and 1908 /// returns the 256-bit result. In effect there are two parallel 128-bit 1909 /// shuffles in the lower and upper halves. 
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] to use as a source of data for the
///    result.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
2009 #define _mm256_shufflelo_epi16(a, imm) \ 2010 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) 2011 2012 /// Sets each byte of the result to the corresponding byte of the 256-bit 2013 /// integer vector in \a __a, the negative of that byte, or zero, depending 2014 /// on whether the corresponding byte of the 256-bit integer vector in 2015 /// \a __b is greater than zero, less than zero, or equal to zero, 2016 /// respectively. 2017 /// 2018 /// \headerfile <immintrin.h> 2019 /// 2020 /// This intrinsic corresponds to the \c VPSIGNB instruction. 2021 /// 2022 /// \param __a 2023 /// A 256-bit integer vector. 2024 /// \param __b 2025 /// A 256-bit integer vector]. 2026 /// \returns A 256-bit integer vector containing the result. 2027 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2028 _mm256_sign_epi8(__m256i __a, __m256i __b) 2029 { 2030 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); 2031 } 2032 2033 /// Sets each element of the result to the corresponding element of the 2034 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element, 2035 /// or zero, depending on whether the corresponding element of the 256-bit 2036 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or 2037 /// equal to zero, respectively. 2038 /// 2039 /// \headerfile <immintrin.h> 2040 /// 2041 /// This intrinsic corresponds to the \c VPSIGNW instruction. 2042 /// 2043 /// \param __a 2044 /// A 256-bit vector of [16 x i16]. 2045 /// \param __b 2046 /// A 256-bit vector of [16 x i16]. 2047 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2048 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2049 _mm256_sign_epi16(__m256i __a, __m256i __b) 2050 { 2051 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); 2052 } 2053 2054 /// Sets each element of the result to the corresponding element of the 2055 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or 2056 /// zero, depending on whether the corresponding element of the 256-bit 2057 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or 2058 /// equal to zero, respectively. 2059 /// 2060 /// \headerfile <immintrin.h> 2061 /// 2062 /// This intrinsic corresponds to the \c VPSIGND instruction. 2063 /// 2064 /// \param __a 2065 /// A 256-bit vector of [8 x i32]. 2066 /// \param __b 2067 /// A 256-bit vector of [8 x i32]. 2068 /// \returns A 256-bit vector of [8 x i32] containing the result. 2069 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2070 _mm256_sign_epi32(__m256i __a, __m256i __b) 2071 { 2072 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); 2073 } 2074 2075 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2076 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 2077 /// is greater than 15, the returned result is all zeroes. 2078 /// 2079 /// \headerfile <immintrin.h> 2080 /// 2081 /// \code 2082 /// __m256i _mm256_slli_si256(__m256i a, const int imm); 2083 /// \endcode 2084 /// 2085 /// This intrinsic corresponds to the \c VPSLLDQ instruction. 2086 /// 2087 /// \param a 2088 /// A 256-bit integer vector to be shifted. 2089 /// \param imm 2090 /// An unsigned immediate value specifying the shift count (in bytes). 2091 /// \returns A 256-bit integer vector containing the result. 
2092 #define _mm256_slli_si256(a, imm) \ 2093 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2094 2095 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2096 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 2097 /// is greater than 15, the returned result is all zeroes. 2098 /// 2099 /// \headerfile <immintrin.h> 2100 /// 2101 /// \code 2102 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm); 2103 /// \endcode 2104 /// 2105 /// This intrinsic corresponds to the \c VPSLLDQ instruction. 2106 /// 2107 /// \param a 2108 /// A 256-bit integer vector to be shifted. 2109 /// \param imm 2110 /// An unsigned immediate value specifying the shift count (in bytes). 2111 /// \returns A 256-bit integer vector containing the result. 2112 #define _mm256_bslli_epi128(a, imm) \ 2113 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2114 2115 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2116 /// left by \a __count bits, shifting in zero bits, and returns the result. 2117 /// If \a __count is greater than 15, the returned result is all zeroes. 2118 /// 2119 /// \headerfile <immintrin.h> 2120 /// 2121 /// This intrinsic corresponds to the \c VPSLLW instruction. 2122 /// 2123 /// \param __a 2124 /// A 256-bit vector of [16 x i16] to be shifted. 2125 /// \param __count 2126 /// An unsigned integer value specifying the shift count (in bits). 2127 /// \returns A 256-bit vector of [16 x i16] containing the result. 2128 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2129 _mm256_slli_epi16(__m256i __a, int __count) 2130 { 2131 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); 2132 } 2133 2134 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2135 /// left by the number of bits specified by the lower 64 bits of \a __count, 2136 /// shifting in zero bits, and returns the result. 
If \a __count is greater 2137 /// than 15, the returned result is all zeroes. 2138 /// 2139 /// \headerfile <immintrin.h> 2140 /// 2141 /// This intrinsic corresponds to the \c VPSLLW instruction. 2142 /// 2143 /// \param __a 2144 /// A 256-bit vector of [16 x i16] to be shifted. 2145 /// \param __count 2146 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2147 /// shift count (in bits). The upper element is ignored. 2148 /// \returns A 256-bit vector of [16 x i16] containing the result. 2149 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2150 _mm256_sll_epi16(__m256i __a, __m128i __count) 2151 { 2152 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); 2153 } 2154 2155 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2156 /// left by \a __count bits, shifting in zero bits, and returns the result. 2157 /// If \a __count is greater than 31, the returned result is all zeroes. 2158 /// 2159 /// \headerfile <immintrin.h> 2160 /// 2161 /// This intrinsic corresponds to the \c VPSLLD instruction. 2162 /// 2163 /// \param __a 2164 /// A 256-bit vector of [8 x i32] to be shifted. 2165 /// \param __count 2166 /// An unsigned integer value specifying the shift count (in bits). 2167 /// \returns A 256-bit vector of [8 x i32] containing the result. 2168 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2169 _mm256_slli_epi32(__m256i __a, int __count) 2170 { 2171 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); 2172 } 2173 2174 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2175 /// left by the number of bits given in the lower 64 bits of \a __count, 2176 /// shifting in zero bits, and returns the result. If \a __count is greater 2177 /// than 31, the returned result is all zeroes. 2178 /// 2179 /// \headerfile <immintrin.h> 2180 /// 2181 /// This intrinsic corresponds to the \c VPSLLD instruction. 
2182 /// 2183 /// \param __a 2184 /// A 256-bit vector of [8 x i32] to be shifted. 2185 /// \param __count 2186 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2187 /// shift count (in bits). The upper element is ignored. 2188 /// \returns A 256-bit vector of [8 x i32] containing the result. 2189 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2190 _mm256_sll_epi32(__m256i __a, __m128i __count) 2191 { 2192 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); 2193 } 2194 2195 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2196 /// left by \a __count bits, shifting in zero bits, and returns the result. 2197 /// If \a __count is greater than 63, the returned result is all zeroes. 2198 /// 2199 /// \headerfile <immintrin.h> 2200 /// 2201 /// This intrinsic corresponds to the \c VPSLLQ instruction. 2202 /// 2203 /// \param __a 2204 /// A 256-bit vector of [4 x i64] to be shifted. 2205 /// \param __count 2206 /// An unsigned integer value specifying the shift count (in bits). 2207 /// \returns A 256-bit vector of [4 x i64] containing the result. 2208 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2209 _mm256_slli_epi64(__m256i __a, int __count) 2210 { 2211 return __builtin_ia32_psllqi256((__v4di)__a, __count); 2212 } 2213 2214 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2215 /// left by the number of bits given in the lower 64 bits of \a __count, 2216 /// shifting in zero bits, and returns the result. If \a __count is greater 2217 /// than 63, the returned result is all zeroes. 2218 /// 2219 /// \headerfile <immintrin.h> 2220 /// 2221 /// This intrinsic corresponds to the \c VPSLLQ instruction. 2222 /// 2223 /// \param __a 2224 /// A 256-bit vector of [4 x i64] to be shifted. 2225 /// \param __count 2226 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2227 /// shift count (in bits). The upper element is ignored. 
2228 /// \returns A 256-bit vector of [4 x i64] containing the result. 2229 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2230 _mm256_sll_epi64(__m256i __a, __m128i __count) 2231 { 2232 return __builtin_ia32_psllq256((__v4di)__a, __count); 2233 } 2234 2235 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2236 /// right by \a __count bits, shifting in sign bits, and returns the result. 2237 /// If \a __count is greater than 15, each element of the result is either 2238 /// 0 or -1 according to the corresponding input sign bit. 2239 /// 2240 /// \headerfile <immintrin.h> 2241 /// 2242 /// This intrinsic corresponds to the \c VPSRAW instruction. 2243 /// 2244 /// \param __a 2245 /// A 256-bit vector of [16 x i16] to be shifted. 2246 /// \param __count 2247 /// An unsigned integer value specifying the shift count (in bits). 2248 /// \returns A 256-bit vector of [16 x i16] containing the result. 2249 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2250 _mm256_srai_epi16(__m256i __a, int __count) 2251 { 2252 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); 2253 } 2254 2255 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2256 /// right by the number of bits given in the lower 64 bits of \a __count, 2257 /// shifting in sign bits, and returns the result. If \a __count is greater 2258 /// than 15, each element of the result is either 0 or -1 according to the 2259 /// corresponding input sign bit. 2260 /// 2261 /// \headerfile <immintrin.h> 2262 /// 2263 /// This intrinsic corresponds to the \c VPSRAW instruction. 2264 /// 2265 /// \param __a 2266 /// A 256-bit vector of [16 x i16] to be shifted. 2267 /// \param __count 2268 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2269 /// shift count (in bits). The upper element is ignored. 2270 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2271 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2272 _mm256_sra_epi16(__m256i __a, __m128i __count) 2273 { 2274 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); 2275 } 2276 2277 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2278 /// right by \a __count bits, shifting in sign bits, and returns the result. 2279 /// If \a __count is greater than 31, each element of the result is either 2280 /// 0 or -1 according to the corresponding input sign bit. 2281 /// 2282 /// \headerfile <immintrin.h> 2283 /// 2284 /// This intrinsic corresponds to the \c VPSRAD instruction. 2285 /// 2286 /// \param __a 2287 /// A 256-bit vector of [8 x i32] to be shifted. 2288 /// \param __count 2289 /// An unsigned integer value specifying the shift count (in bits). 2290 /// \returns A 256-bit vector of [8 x i32] containing the result. 2291 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2292 _mm256_srai_epi32(__m256i __a, int __count) 2293 { 2294 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); 2295 } 2296 2297 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2298 /// right by the number of bits given in the lower 64 bits of \a __count, 2299 /// shifting in sign bits, and returns the result. If \a __count is greater 2300 /// than 31, each element of the result is either 0 or -1 according to the 2301 /// corresponding input sign bit. 2302 /// 2303 /// \headerfile <immintrin.h> 2304 /// 2305 /// This intrinsic corresponds to the \c VPSRAD instruction. 2306 /// 2307 /// \param __a 2308 /// A 256-bit vector of [8 x i32] to be shifted. 2309 /// \param __count 2310 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2311 /// shift count (in bits). The upper element is ignored. 2312 /// \returns A 256-bit vector of [8 x i32] containing the result. 
2313 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2314 _mm256_sra_epi32(__m256i __a, __m128i __count) 2315 { 2316 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); 2317 } 2318 2319 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2320 /// \a imm bytes, shifting in zero bytes, and returns the result. If 2321 /// \a imm is greater than 15, the returned result is all zeroes. 2322 /// 2323 /// \headerfile <immintrin.h> 2324 /// 2325 /// \code 2326 /// __m256i _mm256_srli_si256(__m256i a, const int imm); 2327 /// \endcode 2328 /// 2329 /// This intrinsic corresponds to the \c VPSRLDQ instruction. 2330 /// 2331 /// \param a 2332 /// A 256-bit integer vector to be shifted. 2333 /// \param imm 2334 /// An unsigned immediate value specifying the shift count (in bytes). 2335 /// \returns A 256-bit integer vector containing the result. 2336 #define _mm256_srli_si256(a, imm) \ 2337 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2338 2339 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2340 /// \a imm bytes, shifting in zero bytes, and returns the result. If 2341 /// \a imm is greater than 15, the returned result is all zeroes. 2342 /// 2343 /// \headerfile <immintrin.h> 2344 /// 2345 /// \code 2346 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm); 2347 /// \endcode 2348 /// 2349 /// This intrinsic corresponds to the \c VPSRLDQ instruction. 2350 /// 2351 /// \param a 2352 /// A 256-bit integer vector to be shifted. 2353 /// \param imm 2354 /// An unsigned immediate value specifying the shift count (in bytes). 2355 /// \returns A 256-bit integer vector containing the result. 2356 #define _mm256_bsrli_epi128(a, imm) \ 2357 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2358 2359 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2360 /// right by \a __count bits, shifting in zero bits, and returns the result. 
2361 /// If \a __count is greater than 15, the returned result is all zeroes. 2362 /// 2363 /// \headerfile <immintrin.h> 2364 /// 2365 /// This intrinsic corresponds to the \c VPSRLW instruction. 2366 /// 2367 /// \param __a 2368 /// A 256-bit vector of [16 x i16] to be shifted. 2369 /// \param __count 2370 /// An unsigned integer value specifying the shift count (in bits). 2371 /// \returns A 256-bit vector of [16 x i16] containing the result. 2372 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2373 _mm256_srli_epi16(__m256i __a, int __count) 2374 { 2375 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); 2376 } 2377 2378 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2379 /// right by the number of bits given in the lower 64 bits of \a __count, 2380 /// shifting in zero bits, and returns the result. If \a __count is greater 2381 /// than 15, the returned result is all zeroes. 2382 /// 2383 /// \headerfile <immintrin.h> 2384 /// 2385 /// This intrinsic corresponds to the \c VPSRLW instruction. 2386 /// 2387 /// \param __a 2388 /// A 256-bit vector of [16 x i16] to be shifted. 2389 /// \param __count 2390 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2391 /// shift count (in bits). The upper element is ignored. 2392 /// \returns A 256-bit vector of [16 x i16] containing the result. 2393 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2394 _mm256_srl_epi16(__m256i __a, __m128i __count) 2395 { 2396 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); 2397 } 2398 2399 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2400 /// right by \a __count bits, shifting in zero bits, and returns the result. 2401 /// If \a __count is greater than 31, the returned result is all zeroes. 2402 /// 2403 /// \headerfile <immintrin.h> 2404 /// 2405 /// This intrinsic corresponds to the \c VPSRLD instruction. 
2406 /// 2407 /// \param __a 2408 /// A 256-bit vector of [8 x i32] to be shifted. 2409 /// \param __count 2410 /// An unsigned integer value specifying the shift count (in bits). 2411 /// \returns A 256-bit vector of [8 x i32] containing the result. 2412 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2413 _mm256_srli_epi32(__m256i __a, int __count) 2414 { 2415 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); 2416 } 2417 2418 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2419 /// right by the number of bits given in the lower 64 bits of \a __count, 2420 /// shifting in zero bits, and returns the result. If \a __count is greater 2421 /// than 31, the returned result is all zeroes. 2422 /// 2423 /// \headerfile <immintrin.h> 2424 /// 2425 /// This intrinsic corresponds to the \c VPSRLD instruction. 2426 /// 2427 /// \param __a 2428 /// A 256-bit vector of [8 x i32] to be shifted. 2429 /// \param __count 2430 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2431 /// shift count (in bits). The upper element is ignored. 2432 /// \returns A 256-bit vector of [8 x i32] containing the result. 2433 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2434 _mm256_srl_epi32(__m256i __a, __m128i __count) 2435 { 2436 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); 2437 } 2438 2439 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2440 /// right by \a __count bits, shifting in zero bits, and returns the result. 2441 /// If \a __count is greater than 63, the returned result is all zeroes. 2442 /// 2443 /// \headerfile <immintrin.h> 2444 /// 2445 /// This intrinsic corresponds to the \c VPSRLQ instruction. 2446 /// 2447 /// \param __a 2448 /// A 256-bit vector of [4 x i64] to be shifted. 2449 /// \param __count 2450 /// An unsigned integer value specifying the shift count (in bits). 2451 /// \returns A 256-bit vector of [4 x i64] containing the result. 
2452 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2453 _mm256_srli_epi64(__m256i __a, int __count) 2454 { 2455 return __builtin_ia32_psrlqi256((__v4di)__a, __count); 2456 } 2457 2458 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2459 /// right by the number of bits given in the lower 64 bits of \a __count, 2460 /// shifting in zero bits, and returns the result. If \a __count is greater 2461 /// than 63, the returned result is all zeroes. 2462 /// 2463 /// \headerfile <immintrin.h> 2464 /// 2465 /// This intrinsic corresponds to the \c VPSRLQ instruction. 2466 /// 2467 /// \param __a 2468 /// A 256-bit vector of [4 x i64] to be shifted. 2469 /// \param __count 2470 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2471 /// shift count (in bits). The upper element is ignored. 2472 /// \returns A 256-bit vector of [4 x i64] containing the result. 2473 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2474 _mm256_srl_epi64(__m256i __a, __m128i __count) 2475 { 2476 return __builtin_ia32_psrlq256((__v4di)__a, __count); 2477 } 2478 2479 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2480 /// vectors. Returns the lower 8 bits of each difference in the 2481 /// corresponding byte of the 256-bit integer vector result (overflow is 2482 /// ignored). 2483 /// 2484 /// \code{.operation} 2485 /// FOR i := 0 TO 31 2486 /// j := i*8 2487 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j] 2488 /// ENDFOR 2489 /// \endcode 2490 /// 2491 /// \headerfile <immintrin.h> 2492 /// 2493 /// This intrinsic corresponds to the \c VPSUBB instruction. 2494 /// 2495 /// \param __a 2496 /// A 256-bit integer vector containing the minuends. 2497 /// \param __b 2498 /// A 256-bit integer vector containing the subtrahends. 2499 /// \returns A 256-bit integer vector containing the differences. 
2500 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2501 _mm256_sub_epi8(__m256i __a, __m256i __b) 2502 { 2503 return (__m256i)((__v32qu)__a - (__v32qu)__b); 2504 } 2505 2506 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2507 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in 2508 /// the corresponding element of the [16 x i16] result (overflow is 2509 /// ignored). 2510 /// 2511 /// \code{.operation} 2512 /// FOR i := 0 TO 15 2513 /// j := i*16 2514 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j] 2515 /// ENDFOR 2516 /// \endcode 2517 /// 2518 /// \headerfile <immintrin.h> 2519 /// 2520 /// This intrinsic corresponds to the \c VPSUBW instruction. 2521 /// 2522 /// \param __a 2523 /// A 256-bit vector of [16 x i16] containing the minuends. 2524 /// \param __b 2525 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2526 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2527 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2528 _mm256_sub_epi16(__m256i __a, __m256i __b) 2529 { 2530 return (__m256i)((__v16hu)__a - (__v16hu)__b); 2531 } 2532 2533 /// Subtracts 32-bit integers from corresponding elements of two 256-bit 2534 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in 2535 /// the corresponding element of the [8 x i32] result (overflow is ignored). 2536 /// 2537 /// \code{.operation} 2538 /// FOR i := 0 TO 7 2539 /// j := i*32 2540 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j] 2541 /// ENDFOR 2542 /// \endcode 2543 /// 2544 /// \headerfile <immintrin.h> 2545 /// 2546 /// This intrinsic corresponds to the \c VPSUBD instruction. 2547 /// 2548 /// \param __a 2549 /// A 256-bit vector of [8 x i32] containing the minuends. 2550 /// \param __b 2551 /// A 256-bit vector of [8 x i32] containing the subtrahends. 2552 /// \returns A 256-bit vector of [8 x i32] containing the differences. 
2553 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2554 _mm256_sub_epi32(__m256i __a, __m256i __b) 2555 { 2556 return (__m256i)((__v8su)__a - (__v8su)__b); 2557 } 2558 2559 /// Subtracts 64-bit integers from corresponding elements of two 256-bit 2560 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in 2561 /// the corresponding element of the [4 x i64] result (overflow is ignored). 2562 /// 2563 /// \code{.operation} 2564 /// FOR i := 0 TO 3 2565 /// j := i*64 2566 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j] 2567 /// ENDFOR 2568 /// \endcode 2569 /// 2570 /// \headerfile <immintrin.h> 2571 /// 2572 /// This intrinsic corresponds to the \c VPSUBQ instruction. 2573 /// 2574 /// \param __a 2575 /// A 256-bit vector of [4 x i64] containing the minuends. 2576 /// \param __b 2577 /// A 256-bit vector of [4 x i64] containing the subtrahends. 2578 /// \returns A 256-bit vector of [4 x i64] containing the differences. 2579 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2580 _mm256_sub_epi64(__m256i __a, __m256i __b) 2581 { 2582 return (__m256i)((__v4du)__a - (__v4du)__b); 2583 } 2584 2585 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2586 /// vectors using signed saturation, and returns each differences in the 2587 /// corresponding byte of the 256-bit integer vector result. 2588 /// 2589 /// \code{.operation} 2590 /// FOR i := 0 TO 31 2591 /// j := i*8 2592 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j]) 2593 /// ENDFOR 2594 /// \endcode 2595 /// 2596 /// \headerfile <immintrin.h> 2597 /// 2598 /// This intrinsic corresponds to the \c VPSUBSB instruction. 2599 /// 2600 /// \param __a 2601 /// A 256-bit integer vector containing the minuends. 2602 /// \param __b 2603 /// A 256-bit integer vector containing the subtrahends. 2604 /// \returns A 256-bit integer vector containing the differences. 
2605 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2606 _mm256_subs_epi8(__m256i __a, __m256i __b) 2607 { 2608 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); 2609 } 2610 2611 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2612 /// vectors of [16 x i16] using signed saturation, and returns each 2613 /// difference in the corresponding element of the [16 x i16] result. 2614 /// 2615 /// \code{.operation} 2616 /// FOR i := 0 TO 15 2617 /// j := i*16 2618 /// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j]) 2619 /// ENDFOR 2620 /// \endcode 2621 /// 2622 /// \headerfile <immintrin.h> 2623 /// 2624 /// This intrinsic corresponds to the \c VPSUBSW instruction. 2625 /// 2626 /// \param __a 2627 /// A 256-bit vector of [16 x i16] containing the minuends. 2628 /// \param __b 2629 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2630 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2631 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2632 _mm256_subs_epi16(__m256i __a, __m256i __b) 2633 { 2634 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); 2635 } 2636 2637 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2638 /// vectors using unsigned saturation, and returns each difference in the 2639 /// corresponding byte of the 256-bit integer vector result. For each byte, 2640 /// computes <c> result = __a - __b </c>. 2641 /// 2642 /// \code{.operation} 2643 /// FOR i := 0 TO 31 2644 /// j := i*8 2645 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j]) 2646 /// ENDFOR 2647 /// \endcode 2648 /// 2649 /// \headerfile <immintrin.h> 2650 /// 2651 /// This intrinsic corresponds to the \c VPSUBUSB instruction. 2652 /// 2653 /// \param __a 2654 /// A 256-bit integer vector containing the minuends. 2655 /// \param __b 2656 /// A 256-bit integer vector containing the subtrahends. 
2657 /// \returns A 256-bit integer vector containing the differences. 2658 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2659 _mm256_subs_epu8(__m256i __a, __m256i __b) 2660 { 2661 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); 2662 } 2663 2664 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2665 /// vectors of [16 x i16] using unsigned saturation, and returns each 2666 /// difference in the corresponding element of the [16 x i16] result. 2667 /// 2668 /// \code{.operation} 2669 /// FOR i := 0 TO 15 2670 /// j := i*16 2671 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j]) 2672 /// ENDFOR 2673 /// \endcode 2674 /// 2675 /// \headerfile <immintrin.h> 2676 /// 2677 /// This intrinsic corresponds to the \c VPSUBUSW instruction. 2678 /// 2679 /// \param __a 2680 /// A 256-bit vector of [16 x i16] containing the minuends. 2681 /// \param __b 2682 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2683 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2684 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2685 _mm256_subs_epu16(__m256i __a, __m256i __b) 2686 { 2687 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); 2688 } 2689 2690 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2691 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2692 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as 2693 /// input; other bits in these parameters are ignored. 2694 /// 2695 /// \code{.operation} 2696 /// result[7:0] := __a[71:64] 2697 /// result[15:8] := __b[71:64] 2698 /// result[23:16] := __a[79:72] 2699 /// result[31:24] := __b[79:72] 2700 /// . . . 2701 /// result[127:120] := __b[127:120] 2702 /// result[135:128] := __a[199:192] 2703 /// . . . 
2704 /// result[255:248] := __b[255:248] 2705 /// \endcode 2706 /// 2707 /// \headerfile <immintrin.h> 2708 /// 2709 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction. 2710 /// 2711 /// \param __a 2712 /// A 256-bit integer vector used as the source for the even-numbered bytes 2713 /// of the result. 2714 /// \param __b 2715 /// A 256-bit integer vector used as the source for the odd-numbered bytes 2716 /// of the result. 2717 /// \returns A 256-bit integer vector containing the result. 2718 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2719 _mm256_unpackhi_epi8(__m256i __a, __m256i __b) 2720 { 2721 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); 2722 } 2723 2724 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2725 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2726 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each 2727 /// 128-bit half of \a __a and \a __b as input; other bits in these 2728 /// parameters are ignored. 2729 /// 2730 /// \code{.operation} 2731 /// result[15:0] := __a[79:64] 2732 /// result[31:16] := __b[79:64] 2733 /// result[47:32] := __a[95:80] 2734 /// result[63:48] := __b[95:80] 2735 /// . . . 2736 /// result[127:112] := __b[127:112] 2737 /// result[143:128] := __a[211:196] 2738 /// . . . 2739 /// result[255:240] := __b[255:240] 2740 /// \endcode 2741 /// 2742 /// \headerfile <immintrin.h> 2743 /// 2744 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction. 2745 /// 2746 /// \param __a 2747 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2748 /// elements of the result. 2749 /// \param __b 2750 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2751 /// elements of the result. 
2752 /// \returns A 256-bit vector of [16 x i16] containing the result. 2753 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2754 _mm256_unpackhi_epi16(__m256i __a, __m256i __b) 2755 { 2756 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 2757 } 2758 2759 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2760 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2761 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half 2762 /// of \a __a and \a __b as input; other bits in these parameters are 2763 /// ignored. 2764 /// 2765 /// \code{.operation} 2766 /// result[31:0] := __a[95:64] 2767 /// result[63:32] := __b[95:64] 2768 /// result[95:64] := __a[127:96] 2769 /// result[127:96] := __b[127:96] 2770 /// result[159:128] := __a[223:192] 2771 /// result[191:160] := __b[223:192] 2772 /// result[223:192] := __a[255:224] 2773 /// result[255:224] := __b[255:224] 2774 /// \endcode 2775 /// 2776 /// \headerfile <immintrin.h> 2777 /// 2778 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction. 2779 /// 2780 /// \param __a 2781 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2782 /// elements of the result. 2783 /// \param __b 2784 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2785 /// elements of the result. 2786 /// \returns A 256-bit vector of [8 x i32] containing the result. 2787 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2788 _mm256_unpackhi_epi32(__m256i __a, __m256i __b) 2789 { 2790 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); 2791 } 2792 2793 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2794 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2795 /// of [4 x i64]. 
Specifically, uses the upper 64 bits of each 128-bit half 2796 /// of \a __a and \a __b as input; other bits in these parameters are 2797 /// ignored. 2798 /// 2799 /// \code{.operation} 2800 /// result[63:0] := __a[127:64] 2801 /// result[127:64] := __b[127:64] 2802 /// result[191:128] := __a[255:192] 2803 /// result[255:192] := __b[255:192] 2804 /// \endcode 2805 /// 2806 /// \headerfile <immintrin.h> 2807 /// 2808 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction. 2809 /// 2810 /// \param __a 2811 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2812 /// elements of the result. 2813 /// \param __b 2814 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2815 /// elements of the result. 2816 /// \returns A 256-bit vector of [4 x i64] containing the result. 2817 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2818 _mm256_unpackhi_epi64(__m256i __a, __m256i __b) 2819 { 2820 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); 2821 } 2822 2823 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2824 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2825 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as 2826 /// input; other bits in these parameters are ignored. 2827 /// 2828 /// \code{.operation} 2829 /// result[7:0] := __a[7:0] 2830 /// result[15:8] := __b[7:0] 2831 /// result[23:16] := __a[15:8] 2832 /// result[31:24] := __b[15:8] 2833 /// . . . 2834 /// result[127:120] := __b[63:56] 2835 /// result[135:128] := __a[135:128] 2836 /// . . . 2837 /// result[255:248] := __b[191:184] 2838 /// \endcode 2839 /// 2840 /// \headerfile <immintrin.h> 2841 /// 2842 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction. 2843 /// 2844 /// \param __a 2845 /// A 256-bit integer vector used as the source for the even-numbered bytes 2846 /// of the result. 
2847 /// \param __b 2848 /// A 256-bit integer vector used as the source for the odd-numbered bytes 2849 /// of the result. 2850 /// \returns A 256-bit integer vector containing the result. 2851 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2852 _mm256_unpacklo_epi8(__m256i __a, __m256i __b) 2853 { 2854 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); 2855 } 2856 2857 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2858 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2859 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each 2860 /// 128-bit half of \a __a and \a __b as input; other bits in these 2861 /// parameters are ignored. 2862 /// 2863 /// \code{.operation} 2864 /// result[15:0] := __a[15:0] 2865 /// result[31:16] := __b[15:0] 2866 /// result[47:32] := __a[31:16] 2867 /// result[63:48] := __b[31:16] 2868 /// . . . 2869 /// result[127:112] := __b[63:48] 2870 /// result[143:128] := __a[143:128] 2871 /// . . . 2872 /// result[255:239] := __b[191:176] 2873 /// \endcode 2874 /// 2875 /// \headerfile <immintrin.h> 2876 /// 2877 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction. 2878 /// 2879 /// \param __a 2880 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2881 /// elements of the result. 2882 /// \param __b 2883 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2884 /// elements of the result. 2885 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2886 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2887 _mm256_unpacklo_epi16(__m256i __a, __m256i __b) 2888 { 2889 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); 2890 } 2891 2892 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2893 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2894 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half 2895 /// of \a __a and \a __b as input; other bits in these parameters are 2896 /// ignored. 2897 /// 2898 /// \code{.operation} 2899 /// result[31:0] := __a[31:0] 2900 /// result[63:32] := __b[31:0] 2901 /// result[95:64] := __a[63:32] 2902 /// result[127:96] := __b[63:32] 2903 /// result[159:128] := __a[159:128] 2904 /// result[191:160] := __b[159:128] 2905 /// result[223:192] := __a[191:160] 2906 /// result[255:224] := __b[191:190] 2907 /// \endcode 2908 /// 2909 /// \headerfile <immintrin.h> 2910 /// 2911 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction. 2912 /// 2913 /// \param __a 2914 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2915 /// elements of the result. 2916 /// \param __b 2917 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2918 /// elements of the result. 2919 /// \returns A 256-bit vector of [8 x i32] containing the result. 2920 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2921 _mm256_unpacklo_epi32(__m256i __a, __m256i __b) 2922 { 2923 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); 2924 } 2925 2926 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2927 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2928 /// of [4 x i64]. 
Specifically, uses the lower 64 bits of each 128-bit half 2929 /// of \a __a and \a __b as input; other bits in these parameters are 2930 /// ignored. 2931 /// 2932 /// \code{.operation} 2933 /// result[63:0] := __a[63:0] 2934 /// result[127:64] := __b[63:0] 2935 /// result[191:128] := __a[191:128] 2936 /// result[255:192] := __b[191:128] 2937 /// \endcode 2938 /// 2939 /// \headerfile <immintrin.h> 2940 /// 2941 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction. 2942 /// 2943 /// \param __a 2944 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2945 /// elements of the result. 2946 /// \param __b 2947 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2948 /// elements of the result. 2949 /// \returns A 256-bit vector of [4 x i64] containing the result. 2950 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2951 _mm256_unpacklo_epi64(__m256i __a, __m256i __b) 2952 { 2953 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); 2954 } 2955 2956 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and 2957 /// \a __b. 2958 /// 2959 /// \headerfile <immintrin.h> 2960 /// 2961 /// This intrinsic corresponds to the \c VPXOR instruction. 2962 /// 2963 /// \param __a 2964 /// A 256-bit integer vector. 2965 /// \param __b 2966 /// A 256-bit integer vector. 2967 /// \returns A 256-bit integer vector containing the result. 2968 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2969 _mm256_xor_si256(__m256i __a, __m256i __b) 2970 { 2971 return (__m256i)((__v4du)__a ^ (__v4du)__b); 2972 } 2973 2974 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal 2975 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte 2976 /// boundary. 2977 /// 2978 /// \headerfile <immintrin.h> 2979 /// 2980 /// This intrinsic corresponds to the \c VMOVNTDQA instruction. 
2981 /// 2982 /// \param __V 2983 /// A pointer to the 32-byte aligned memory containing the vector to load. 2984 /// \returns A 256-bit integer vector loaded from memory. 2985 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2986 _mm256_stream_load_si256(const void *__V) 2987 { 2988 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 2989 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); 2990 } 2991 2992 /// Broadcasts the 32-bit floating-point value from the low element of the 2993 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's 2994 /// 128-bit vector of [4 x float]. 2995 /// 2996 /// \headerfile <immintrin.h> 2997 /// 2998 /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 2999 /// 3000 /// \param __X 3001 /// A 128-bit vector of [4 x float] whose low element will be broadcast. 3002 /// \returns A 128-bit vector of [4 x float] containing the result. 3003 static __inline__ __m128 __DEFAULT_FN_ATTRS128 3004 _mm_broadcastss_ps(__m128 __X) 3005 { 3006 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); 3007 } 3008 3009 /// Broadcasts the 64-bit floating-point value from the low element of the 3010 /// 128-bit vector of [2 x double] in \a __a to both elements of the 3011 /// result's 128-bit vector of [2 x double]. 3012 /// 3013 /// \headerfile <immintrin.h> 3014 /// 3015 /// This intrinsic corresponds to the \c MOVDDUP instruction. 3016 /// 3017 /// \param __a 3018 /// A 128-bit vector of [2 x double] whose low element will be broadcast. 3019 /// \returns A 128-bit vector of [2 x double] containing the result. 
3020 static __inline__ __m128d __DEFAULT_FN_ATTRS128 3021 _mm_broadcastsd_pd(__m128d __a) 3022 { 3023 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 3024 } 3025 3026 /// Broadcasts the 32-bit floating-point value from the low element of the 3027 /// 128-bit vector of [4 x float] in \a __X to all elements of the 3028 /// result's 256-bit vector of [8 x float]. 3029 /// 3030 /// \headerfile <immintrin.h> 3031 /// 3032 /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 3033 /// 3034 /// \param __X 3035 /// A 128-bit vector of [4 x float] whose low element will be broadcast. 3036 /// \returns A 256-bit vector of [8 x float] containing the result. 3037 static __inline__ __m256 __DEFAULT_FN_ATTRS256 3038 _mm256_broadcastss_ps(__m128 __X) 3039 { 3040 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3041 } 3042 3043 /// Broadcasts the 64-bit floating-point value from the low element of the 3044 /// 128-bit vector of [2 x double] in \a __X to all elements of the 3045 /// result's 256-bit vector of [4 x double]. 3046 /// 3047 /// \headerfile <immintrin.h> 3048 /// 3049 /// This intrinsic corresponds to the \c VBROADCASTSD instruction. 3050 /// 3051 /// \param __X 3052 /// A 128-bit vector of [2 x double] whose low element will be broadcast. 3053 /// \returns A 256-bit vector of [4 x double] containing the result. 3054 static __inline__ __m256d __DEFAULT_FN_ATTRS256 3055 _mm256_broadcastsd_pd(__m128d __X) 3056 { 3057 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); 3058 } 3059 3060 /// Broadcasts the 128-bit integer data from \a __X to both the lower and 3061 /// upper halves of the 256-bit result. 3062 /// 3063 /// \headerfile <immintrin.h> 3064 /// 3065 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction. 3066 /// 3067 /// \param __X 3068 /// A 128-bit integer vector to be broadcast. 3069 /// \returns A 256-bit integer vector containing the result. 
3070 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3071 _mm256_broadcastsi128_si256(__m128i __X) 3072 { 3073 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); 3074 } 3075 3076 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) 3077 3078 /// Merges 32-bit integer elements from either of the two 128-bit vectors of 3079 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32], 3080 /// as specified by the immediate integer operand \a M. 3081 /// 3082 /// \code{.operation} 3083 /// FOR i := 0 TO 3 3084 /// j := i*32 3085 /// IF M[i] == 0 3086 /// result[31+j:j] := V1[31+j:j] 3087 /// ELSE 3088 /// result[31+j:j] := V2[32+j:j] 3089 /// FI 3090 /// ENDFOR 3091 /// \endcode 3092 /// 3093 /// \headerfile <immintrin.h> 3094 /// 3095 /// \code 3096 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M); 3097 /// \endcode 3098 /// 3099 /// This intrinsic corresponds to the \c VPBLENDDD instruction. 3100 /// 3101 /// \param V1 3102 /// A 128-bit vector of [4 x i32] containing source values. 3103 /// \param V2 3104 /// A 128-bit vector of [4 x i32] containing source values. 3105 /// \param M 3106 /// An immediate 8-bit integer operand, with bits [3:0] specifying the 3107 /// source for each element of the result. The position of the mask bit 3108 /// corresponds to the index of a copied value. When a mask bit is 0, the 3109 /// element is copied from \a V1; otherwise, it is copied from \a V2. 3110 /// \returns A 128-bit vector of [4 x i32] containing the result. 3111 #define _mm_blend_epi32(V1, V2, M) \ 3112 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ 3113 (__v4si)(__m128i)(V2), (int)(M))) 3114 3115 /// Merges 32-bit integer elements from either of the two 256-bit vectors of 3116 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32], 3117 /// as specified by the immediate integer operand \a M. 
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendd256(                                         \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))

/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
///    bytes of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
///
/// \param __X
///    A 128-bit integer vector whose low byte will be broadcast.
/// \returns A 256-bit integer vector containing the result.
3162 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3163 _mm256_broadcastb_epi8(__m128i __X) 3164 { 3165 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3166 } 3167 3168 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X 3169 /// to all elements of the result's 256-bit vector of [16 x i16]. 3170 /// 3171 /// \headerfile <immintrin.h> 3172 /// 3173 /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3174 /// 3175 /// \param __X 3176 /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3177 /// \returns A 256-bit vector of [16 x i16] containing the result. 3178 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3179 _mm256_broadcastw_epi16(__m128i __X) 3180 { 3181 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3182 } 3183 3184 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3185 /// to all elements of the result's 256-bit vector of [8 x i32]. 3186 /// 3187 /// \headerfile <immintrin.h> 3188 /// 3189 /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 3190 /// 3191 /// \param __X 3192 /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3193 /// \returns A 256-bit vector of [8 x i32] containing the result. 3194 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3195 _mm256_broadcastd_epi32(__m128i __X) 3196 { 3197 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3198 } 3199 3200 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3201 /// to all elements of the result's 256-bit vector of [4 x i64]. 3202 /// 3203 /// \headerfile <immintrin.h> 3204 /// 3205 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 
3206 /// 3207 /// \param __X 3208 /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3209 /// \returns A 256-bit vector of [4 x i64] containing the result. 3210 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3211 _mm256_broadcastq_epi64(__m128i __X) 3212 { 3213 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); 3214 } 3215 3216 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all 3217 /// bytes of the 128-bit result. 3218 /// 3219 /// \headerfile <immintrin.h> 3220 /// 3221 /// This intrinsic corresponds to the \c VPBROADCASTB instruction. 3222 /// 3223 /// \param __X 3224 /// A 128-bit integer vector whose low byte will be broadcast. 3225 /// \returns A 128-bit integer vector containing the result. 3226 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3227 _mm_broadcastb_epi8(__m128i __X) 3228 { 3229 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3230 } 3231 3232 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in 3233 /// \a __X to all elements of the result's 128-bit vector of [8 x i16]. 3234 /// 3235 /// \headerfile <immintrin.h> 3236 /// 3237 /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3238 /// 3239 /// \param __X 3240 /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3241 /// \returns A 128-bit vector of [8 x i16] containing the result. 3242 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3243 _mm_broadcastw_epi16(__m128i __X) 3244 { 3245 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3246 } 3247 3248 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3249 /// to all elements of the result's vector of [4 x i32]. 3250 /// 3251 /// \headerfile <immintrin.h> 3252 /// 3253 /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 
3254 /// 3255 /// \param __X 3256 /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3257 /// \returns A 128-bit vector of [4 x i32] containing the result. 3258 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3259 _mm_broadcastd_epi32(__m128i __X) 3260 { 3261 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); 3262 } 3263 3264 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3265 /// to both elements of the result's 128-bit vector of [2 x i64]. 3266 /// 3267 /// \headerfile <immintrin.h> 3268 /// 3269 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 3270 /// 3271 /// \param __X 3272 /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3273 /// \returns A 128-bit vector of [2 x i64] containing the result. 3274 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3275 _mm_broadcastq_epi64(__m128i __X) 3276 { 3277 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); 3278 } 3279 3280 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 3281 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the 3282 /// elements of the 256-bit vector of [8 x i32] in \a __b. 3283 /// 3284 /// \code{.operation} 3285 /// FOR i := 0 TO 7 3286 /// j := i*32 3287 /// k := __b[j+2:j] * 32 3288 /// result[j+31:j] := __a[k+31:k] 3289 /// ENDFOR 3290 /// \endcode 3291 /// 3292 /// \headerfile <immintrin.h> 3293 /// 3294 /// This intrinsic corresponds to the \c VPERMD instruction. 3295 /// 3296 /// \param __a 3297 /// A 256-bit vector of [8 x i32] containing the source values. 3298 /// \param __b 3299 /// A 256-bit vector of [8 x i32] containing indexes of values to use from 3300 /// \a __a. 3301 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3302 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3303 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) 3304 { 3305 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); 3306 } 3307 3308 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of 3309 /// the 256-bit vector of [4 x double] in \a V as specified by the 3310 /// immediate value \a M. 3311 /// 3312 /// \code{.operation} 3313 /// FOR i := 0 TO 3 3314 /// j := i*64 3315 /// k := (M >> i*2)[1:0] * 64 3316 /// result[j+63:j] := V[k+63:k] 3317 /// ENDFOR 3318 /// \endcode 3319 /// 3320 /// \headerfile <immintrin.h> 3321 /// 3322 /// \code 3323 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M); 3324 /// \endcode 3325 /// 3326 /// This intrinsic corresponds to the \c VPERMPD instruction. 3327 /// 3328 /// \param V 3329 /// A 256-bit vector of [4 x double] containing the source values. 3330 /// \param M 3331 /// An immediate 8-bit value specifying which elements to copy from \a V. 3332 /// \a M[1:0] specifies the index in \a a for element 0 of the result, 3333 /// \a M[3:2] specifies the index for element 1, and so forth. 3334 /// \returns A 256-bit vector of [4 x double] containing the result. 3335 #define _mm256_permute4x64_pd(V, M) \ 3336 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) 3337 3338 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of 3339 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in 3340 /// the elements of the 256-bit vector of [8 x i32] in \a __b. 3341 /// 3342 /// \code{.operation} 3343 /// FOR i := 0 TO 7 3344 /// j := i*32 3345 /// k := __b[j+2:j] * 32 3346 /// result[j+31:j] := __a[k+31:k] 3347 /// ENDFOR 3348 /// \endcode 3349 /// 3350 /// \headerfile <immintrin.h> 3351 /// 3352 /// This intrinsic corresponds to the \c VPERMPS instruction. 3353 /// 3354 /// \param __a 3355 /// A 256-bit vector of [8 x float] containing the source values. 
3356 /// \param __b 3357 /// A 256-bit vector of [8 x i32] containing indexes of values to use from 3358 /// \a __a. 3359 /// \returns A 256-bit vector of [8 x float] containing the result. 3360 static __inline__ __m256 __DEFAULT_FN_ATTRS256 3361 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) 3362 { 3363 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); 3364 } 3365 3366 /// Sets the result's 256-bit vector of [4 x i64] result to copies of elements 3367 /// of the 256-bit vector of [4 x i64] in \a V as specified by the 3368 /// immediate value \a M. 3369 /// 3370 /// \code{.operation} 3371 /// FOR i := 0 TO 3 3372 /// j := i*64 3373 /// k := (M >> i*2)[1:0] * 64 3374 /// result[j+63:j] := V[k+63:k] 3375 /// ENDFOR 3376 /// \endcode 3377 /// 3378 /// \headerfile <immintrin.h> 3379 /// 3380 /// \code 3381 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M); 3382 /// \endcode 3383 /// 3384 /// This intrinsic corresponds to the \c VPERMQ instruction. 3385 /// 3386 /// \param V 3387 /// A 256-bit vector of [4 x i64] containing the source values. 3388 /// \param M 3389 /// An immediate 8-bit value specifying which elements to copy from \a V. 3390 /// \a M[1:0] specifies the index in \a a for element 0 of the result, 3391 /// \a M[3:2] specifies the index for element 1, and so forth. 3392 /// \returns A 256-bit vector of [4 x i64] containing the result. 3393 #define _mm256_permute4x64_epi64(V, M) \ 3394 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) 3395 3396 /// Sets each half of the 256-bit result either to zero or to one of the 3397 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2, 3398 /// as specified by the immediate value \a M. 
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M)                                   \
  ((__m256i)__builtin_ia32_permti256(                                          \
      (__m256i)(V1), (__m256i)(V2), (int)(M)))

/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///     of the immediate \a M is zero, extracts the lower half of \a V;
///     otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M)                                         \
  ((__m128i)__builtin_ia32_extract128i256(                                     \
      (__v4di)(__m256i)(V), (int)(M)))

/// Copies the 256-bit vector \a V1 to the result, then replaces one half of
///    the result with the 128-bit vector \a V2. If bit 0 of the immediate
///    \a M is zero, the lower half of the result is replaced; otherwise,
///    the upper half is replaced.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_insert128i256(                                      \
      (__v4di)(__m256i)(V1), (__v2di)(__m128i)(V2), (int)(M)))

/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
///    the most significant bit of the corresponding element in the mask
///    \a __M is set; otherwise, sets that element of the result to zero.
///    Returns the 256-bit [8 x i32] result.
3490 /// 3491 /// \code{.operation} 3492 /// FOR i := 0 TO 7 3493 /// j := i*32 3494 /// IF __M[j+31] == 1 3495 /// result[j+31:j] := Load32(__X+(i*4)) 3496 /// ELSE 3497 /// result[j+31:j] := 0 3498 /// FI 3499 /// ENDFOR 3500 /// \endcode 3501 /// 3502 /// \headerfile <immintrin.h> 3503 /// 3504 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3505 /// 3506 /// \param __X 3507 /// A pointer to the memory used for loading values. 3508 /// \param __M 3509 /// A 256-bit vector of [8 x i32] containing the mask bits. 3510 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed 3511 /// elements. 3512 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3513 _mm256_maskload_epi32(int const *__X, __m256i __M) 3514 { 3515 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); 3516 } 3517 3518 /// Conditionally loads four 64-bit integer elements from memory \a __X, if 3519 /// the most significant bit of the corresponding element in the mask 3520 /// \a __M is set; otherwise, sets that element of the result to zero. 3521 /// Returns the 256-bit [4 x i64] result. 3522 /// 3523 /// \code{.operation} 3524 /// FOR i := 0 TO 3 3525 /// j := i*64 3526 /// IF __M[j+63] == 1 3527 /// result[j+63:j] := Load64(__X+(i*8)) 3528 /// ELSE 3529 /// result[j+63:j] := 0 3530 /// FI 3531 /// ENDFOR 3532 /// \endcode 3533 /// 3534 /// \headerfile <immintrin.h> 3535 /// 3536 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3537 /// 3538 /// \param __X 3539 /// A pointer to the memory used for loading values. 3540 /// \param __M 3541 /// A 256-bit vector of [4 x i64] containing the mask bits. 3542 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed 3543 /// elements. 
3544 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3545 _mm256_maskload_epi64(long long const *__X, __m256i __M) 3546 { 3547 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); 3548 } 3549 3550 /// Conditionally loads four 32-bit integer elements from memory \a __X, if 3551 /// the most significant bit of the corresponding element in the mask 3552 /// \a __M is set; otherwise, sets that element of the result to zero. 3553 /// Returns the 128-bit [4 x i32] result. 3554 /// 3555 /// \code{.operation} 3556 /// FOR i := 0 TO 3 3557 /// j := i*32 3558 /// IF __M[j+31] == 1 3559 /// result[j+31:j] := Load32(__X+(i*4)) 3560 /// ELSE 3561 /// result[j+31:j] := 0 3562 /// FI 3563 /// ENDFOR 3564 /// \endcode 3565 /// 3566 /// \headerfile <immintrin.h> 3567 /// 3568 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3569 /// 3570 /// \param __X 3571 /// A pointer to the memory used for loading values. 3572 /// \param __M 3573 /// A 128-bit vector of [4 x i32] containing the mask bits. 3574 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed 3575 /// elements. 3576 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3577 _mm_maskload_epi32(int const *__X, __m128i __M) 3578 { 3579 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); 3580 } 3581 3582 /// Conditionally loads two 64-bit integer elements from memory \a __X, if 3583 /// the most significant bit of the corresponding element in the mask 3584 /// \a __M is set; otherwise, sets that element of the result to zero. 3585 /// Returns the 128-bit [2 x i64] result. 3586 /// 3587 /// \code{.operation} 3588 /// FOR i := 0 TO 1 3589 /// j := i*64 3590 /// IF __M[j+63] == 1 3591 /// result[j+63:j] := Load64(__X+(i*8)) 3592 /// ELSE 3593 /// result[j+63:j] := 0 3594 /// FI 3595 /// ENDFOR 3596 /// \endcode 3597 /// 3598 /// \headerfile <immintrin.h> 3599 /// 3600 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 
3601 /// 3602 /// \param __X 3603 /// A pointer to the memory used for loading values. 3604 /// \param __M 3605 /// A 128-bit vector of [2 x i64] containing the mask bits. 3606 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed 3607 /// elements. 3608 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3609 _mm_maskload_epi64(long long const *__X, __m128i __M) 3610 { 3611 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); 3612 } 3613 3614 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector 3615 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of 3616 /// the corresponding element in the mask \a __M is set; otherwise, the 3617 /// memory element is unchanged. 3618 /// 3619 /// \code{.operation} 3620 /// FOR i := 0 TO 7 3621 /// j := i*32 3622 /// IF __M[j+31] == 1 3623 /// Store32(__X+(i*4), __Y[j+31:j]) 3624 /// FI 3625 /// ENDFOR 3626 /// \endcode 3627 /// 3628 /// \headerfile <immintrin.h> 3629 /// 3630 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3631 /// 3632 /// \param __X 3633 /// A pointer to the memory used for storing values. 3634 /// \param __M 3635 /// A 256-bit vector of [8 x i32] containing the mask bits. 3636 /// \param __Y 3637 /// A 256-bit vector of [8 x i32] containing the values to store. 3638 static __inline__ void __DEFAULT_FN_ATTRS256 3639 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) 3640 { 3641 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 3642 } 3643 3644 /// Conditionally stores four 64-bit integer elements from the 256-bit vector 3645 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of 3646 /// the corresponding element in the mask \a __M is set; otherwise, the 3647 /// memory element is unchanged. 
3648 /// 3649 /// \code{.operation} 3650 /// FOR i := 0 TO 3 3651 /// j := i*64 3652 /// IF __M[j+63] == 1 3653 /// Store64(__X+(i*8), __Y[j+63:j]) 3654 /// FI 3655 /// ENDFOR 3656 /// \endcode 3657 /// 3658 /// \headerfile <immintrin.h> 3659 /// 3660 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3661 /// 3662 /// \param __X 3663 /// A pointer to the memory used for storing values. 3664 /// \param __M 3665 /// A 256-bit vector of [4 x i64] containing the mask bits. 3666 /// \param __Y 3667 /// A 256-bit vector of [4 x i64] containing the values to store. 3668 static __inline__ void __DEFAULT_FN_ATTRS256 3669 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) 3670 { 3671 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 3672 } 3673 3674 /// Conditionally stores four 32-bit integer elements from the 128-bit vector 3675 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of 3676 /// the corresponding element in the mask \a __M is set; otherwise, the 3677 /// memory element is unchanged. 3678 /// 3679 /// \code{.operation} 3680 /// FOR i := 0 TO 3 3681 /// j := i*32 3682 /// IF __M[j+31] == 1 3683 /// Store32(__X+(i*4), __Y[j+31:j]) 3684 /// FI 3685 /// ENDFOR 3686 /// \endcode 3687 /// 3688 /// \headerfile <immintrin.h> 3689 /// 3690 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3691 /// 3692 /// \param __X 3693 /// A pointer to the memory used for storing values. 3694 /// \param __M 3695 /// A 128-bit vector of [4 x i32] containing the mask bits. 3696 /// \param __Y 3697 /// A 128-bit vector of [4 x i32] containing the values to store. 
3698 static __inline__ void __DEFAULT_FN_ATTRS128 3699 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) 3700 { 3701 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 3702 } 3703 3704 /// Conditionally stores two 64-bit integer elements from the 128-bit vector 3705 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of 3706 /// the corresponding element in the mask \a __M is set; otherwise, the 3707 /// memory element is unchanged. 3708 /// 3709 /// \code{.operation} 3710 /// FOR i := 0 TO 1 3711 /// j := i*64 3712 /// IF __M[j+63] == 1 3713 /// Store64(__X+(i*8), __Y[j+63:j]) 3714 /// FI 3715 /// ENDFOR 3716 /// \endcode 3717 /// 3718 /// \headerfile <immintrin.h> 3719 /// 3720 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3721 /// 3722 /// \param __X 3723 /// A pointer to the memory used for storing values. 3724 /// \param __M 3725 /// A 128-bit vector of [2 x i64] containing the mask bits. 3726 /// \param __Y 3727 /// A 128-bit vector of [2 x i64] containing the values to store. 3728 static __inline__ void __DEFAULT_FN_ATTRS128 3729 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) 3730 { 3731 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 3732 } 3733 3734 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3735 /// left by the number of bits given in the corresponding element of the 3736 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3737 /// returns the result. If the shift count for any element is greater than 3738 /// 31, the result for that element is zero. 3739 /// 3740 /// \headerfile <immintrin.h> 3741 /// 3742 /// This intrinsic corresponds to the \c VPSLLVD instruction. 3743 /// 3744 /// \param __X 3745 /// A 256-bit vector of [8 x i32] to be shifted. 3746 /// \param __Y 3747 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3748 /// bits). 
3749 /// \returns A 256-bit vector of [8 x i32] containing the result. 3750 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3751 _mm256_sllv_epi32(__m256i __X, __m256i __Y) 3752 { 3753 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); 3754 } 3755 3756 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3757 /// left by the number of bits given in the corresponding element of the 3758 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3759 /// returns the result. If the shift count for any element is greater than 3760 /// 31, the result for that element is zero. 3761 /// 3762 /// \headerfile <immintrin.h> 3763 /// 3764 /// This intrinsic corresponds to the \c VPSLLVD instruction. 3765 /// 3766 /// \param __X 3767 /// A 128-bit vector of [4 x i32] to be shifted. 3768 /// \param __Y 3769 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3770 /// bits). 3771 /// \returns A 128-bit vector of [4 x i32] containing the result. 3772 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3773 _mm_sllv_epi32(__m128i __X, __m128i __Y) 3774 { 3775 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); 3776 } 3777 3778 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3779 /// left by the number of bits given in the corresponding element of the 3780 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3781 /// returns the result. If the shift count for any element is greater than 3782 /// 63, the result for that element is zero. 3783 /// 3784 /// \headerfile <immintrin.h> 3785 /// 3786 /// This intrinsic corresponds to the \c VPSLLVQ instruction. 3787 /// 3788 /// \param __X 3789 /// A 256-bit vector of [4 x i64] to be shifted. 3790 /// \param __Y 3791 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3792 /// bits). 3793 /// \returns A 256-bit vector of [4 x i64] containing the result. 
3794 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3795 _mm256_sllv_epi64(__m256i __X, __m256i __Y) 3796 { 3797 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); 3798 } 3799 3800 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3801 /// left by the number of bits given in the corresponding element of the 3802 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3803 /// returns the result. If the shift count for any element is greater than 3804 /// 63, the result for that element is zero. 3805 /// 3806 /// \headerfile <immintrin.h> 3807 /// 3808 /// This intrinsic corresponds to the \c VPSLLVQ instruction. 3809 /// 3810 /// \param __X 3811 /// A 128-bit vector of [2 x i64] to be shifted. 3812 /// \param __Y 3813 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3814 /// bits). 3815 /// \returns A 128-bit vector of [2 x i64] containing the result. 3816 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3817 _mm_sllv_epi64(__m128i __X, __m128i __Y) 3818 { 3819 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); 3820 } 3821 3822 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3823 /// right by the number of bits given in the corresponding element of the 3824 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and 3825 /// returns the result. If the shift count for any element is greater than 3826 /// 31, the result for that element is 0 or -1 according to the sign bit 3827 /// for that element. 3828 /// 3829 /// \headerfile <immintrin.h> 3830 /// 3831 /// This intrinsic corresponds to the \c VPSRAVD instruction. 3832 /// 3833 /// \param __X 3834 /// A 256-bit vector of [8 x i32] to be shifted. 3835 /// \param __Y 3836 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3837 /// bits). 3838 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3839 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3840 _mm256_srav_epi32(__m256i __X, __m256i __Y) 3841 { 3842 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); 3843 } 3844 3845 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3846 /// right by the number of bits given in the corresponding element of the 3847 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and 3848 /// returns the result. If the shift count for any element is greater than 3849 /// 31, the result for that element is 0 or -1 according to the sign bit 3850 /// for that element. 3851 /// 3852 /// \headerfile <immintrin.h> 3853 /// 3854 /// This intrinsic corresponds to the \c VPSRAVD instruction. 3855 /// 3856 /// \param __X 3857 /// A 128-bit vector of [4 x i32] to be shifted. 3858 /// \param __Y 3859 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3860 /// bits). 3861 /// \returns A 128-bit vector of [4 x i32] containing the result. 3862 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3863 _mm_srav_epi32(__m128i __X, __m128i __Y) 3864 { 3865 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); 3866 } 3867 3868 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3869 /// right by the number of bits given in the corresponding element of the 3870 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3871 /// returns the result. If the shift count for any element is greater than 3872 /// 31, the result for that element is zero. 3873 /// 3874 /// \headerfile <immintrin.h> 3875 /// 3876 /// This intrinsic corresponds to the \c VPSRLVD instruction. 3877 /// 3878 /// \param __X 3879 /// A 256-bit vector of [8 x i32] to be shifted. 3880 /// \param __Y 3881 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3882 /// bits). 3883 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3884 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3885 _mm256_srlv_epi32(__m256i __X, __m256i __Y) 3886 { 3887 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); 3888 } 3889 3890 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3891 /// right by the number of bits given in the corresponding element of the 3892 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3893 /// returns the result. If the shift count for any element is greater than 3894 /// 31, the result for that element is zero. 3895 /// 3896 /// \headerfile <immintrin.h> 3897 /// 3898 /// This intrinsic corresponds to the \c VPSRLVD instruction. 3899 /// 3900 /// \param __X 3901 /// A 128-bit vector of [4 x i32] to be shifted. 3902 /// \param __Y 3903 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3904 /// bits). 3905 /// \returns A 128-bit vector of [4 x i32] containing the result. 3906 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3907 _mm_srlv_epi32(__m128i __X, __m128i __Y) 3908 { 3909 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); 3910 } 3911 3912 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3913 /// right by the number of bits given in the corresponding element of the 3914 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3915 /// returns the result. If the shift count for any element is greater than 3916 /// 63, the result for that element is zero. 3917 /// 3918 /// \headerfile <immintrin.h> 3919 /// 3920 /// This intrinsic corresponds to the \c VPSRLVQ instruction. 3921 /// 3922 /// \param __X 3923 /// A 256-bit vector of [4 x i64] to be shifted. 3924 /// \param __Y 3925 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3926 /// bits). 3927 /// \returns A 256-bit vector of [4 x i64] containing the result. 
3928 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3929 _mm256_srlv_epi64(__m256i __X, __m256i __Y) 3930 { 3931 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); 3932 } 3933 3934 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3935 /// right by the number of bits given in the corresponding element of the 3936 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3937 /// returns the result. If the shift count for any element is greater than 3938 /// 63, the result for that element is zero. 3939 /// 3940 /// \headerfile <immintrin.h> 3941 /// 3942 /// This intrinsic corresponds to the \c VPSRLVQ instruction. 3943 /// 3944 /// \param __X 3945 /// A 128-bit vector of [2 x i64] to be shifted. 3946 /// \param __Y 3947 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3948 /// bits). 3949 /// \returns A 128-bit vector of [2 x i64] containing the result. 3950 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3951 _mm_srlv_epi64(__m128i __X, __m128i __Y) 3952 { 3953 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); 3954 } 3955 3956 /// Conditionally gathers two 64-bit floating-point values, either from the 3957 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled 3958 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector 3959 /// of [2 x double] in \a mask determines the source for each element. 
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s)                             \
  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
      (__v4df)(__m256d)(a), (const double *)(m), (__v4si)(__m128i)(i),         \
      (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s)                                \
  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
      (__v2df)(__m128d)(a), (const double *)(m), (__v2di)(__m128i)(i),         \
      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s)                             \
  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
      (__v4df)(__m256d)(a), (const double *)(m), (__v4di)(__m256i)(i),         \
      (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s)                                \
  ((__m128)__builtin_ia32_gatherd_ps(                                          \
      (__v4sf)(__m128)(a), (const float *)(m), (__v4si)(__m128i)(i),           \
      (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), (s)))

/// Conditionally gathers two 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory. Only the first
///    two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///    The 128-bit vector of [4 x float] in \a mask determines the source for
///    each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///    The 128-bit vector of [4 x i32] in \a mask determines the source for
///    each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers eight 32-bit integer values, either from the
///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///    The 256-bit vector of [8 x i32] in \a mask determines the source for
///    each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i), \
      (__v8si)(__m256i)(mask), (s)))

/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///    The 128-bit vector of [4 x i32] in \a mask determines the source for the
///    lower two elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory. Only the first two elements
///    are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v2di)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i), \
      (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), (s)))

/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v4si)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))

/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))

/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i.
///    Must be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v2di)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))

/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)_mm256_undefined_pd(), (double const *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))

/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))

/// Gathers eight 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)_mm256_undefined_ps(), (float const *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))

/// Gathers two 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
///    elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))

/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))

/// Gathers four 32-bit integer values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i32gather_epi32(m, i, s)                                           \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(),            \
                                     (int const *)(m), (__v4si)(__m128i)(i),   \
                                     (__v4si)_mm_set1_epi32(-1), (s)))

/// Gathers eight 32-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_i32gather_epi32(m, i, s)                                        \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(),      \
                                        (int const *)(m), (__v8si)(__m256i)(i), \
                                        (__v8si)_mm256_set1_epi32(-1), (s)))

/// Gathers two 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
///    of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i64gather_epi32(m, i, s)                                           \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(),            \
                                     (int const *)(m), (__v2di)(__m128i)(i),   \
                                     (__v4si)_mm_set1_epi32(-1), (s)))

/// Gathers four 32-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_i64gather_epi32(m, i, s)                                        \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(),         \
                                        (int const *)(m), (__v4di)(__m256i)(i), \
                                        (__v4si)_mm_set1_epi32(-1), (s)))

/// Gathers two 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i32gather_epi64(m, i, s)                                           \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(),            \
                                     (long long const *)(m),                   \
                                     (__v4si)(__m128i)(i),                     \
                                     (__v2di)_mm_set1_epi64x(-1), (s)))

/// Gathers four 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i32gather_epi64(m, i, s)                                        \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(),      \
                                        (long long const *)(m),                \
                                        (__v4si)(__m128i)(i),                  \
                                        (__v4di)_mm256_set1_epi64x(-1), (s)))

/// Gathers two 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
5241 /// \returns A 128-bit vector of [2 x i64] containing the gathered values. 5242 #define _mm_i64gather_epi64(m, i, s) \ 5243 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ 5244 (long long const *)(m), \ 5245 (__v2di)(__m128i)(i), \ 5246 (__v2di)_mm_set1_epi64x(-1), (s))) 5247 5248 /// Gathers four 64-bit integer values from memory \a m using scaled indexes 5249 /// from the 256-bit vector of [4 x i64] in \a i. 5250 /// 5251 /// \code{.operation} 5252 /// FOR element := 0 to 3 5253 /// j := element*64 5254 /// k := element*64 5255 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 5256 /// ENDFOR 5257 /// \endcode 5258 /// 5259 /// \headerfile <immintrin.h> 5260 /// 5261 /// \code 5262 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s); 5263 /// \endcode 5264 /// 5265 /// This intrinsic corresponds to the \c VPGATHERQQ instruction. 5266 /// 5267 /// \param m 5268 /// A pointer to the memory used for loading values. 5269 /// \param i 5270 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 5271 /// \param s 5272 /// A literal constant scale factor for the indexes in \a i. Must be 5273 /// 1, 2, 4, or 8. 5274 /// \returns A 256-bit vector of [4 x i64] containing the gathered values. 5275 #define _mm256_i64gather_epi64(m, i, s) \ 5276 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ 5277 (long long const *)(m), \ 5278 (__v4di)(__m256i)(i), \ 5279 (__v4di)_mm256_set1_epi64x(-1), (s))) 5280 5281 #undef __DEFAULT_FN_ATTRS256 5282 #undef __DEFAULT_FN_ATTRS128 5283 5284 #endif /* __AVX2INTRIN_H */ 5285