1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __AVX2INTRIN_H 15 #define __AVX2INTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256))) 19 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128))) 20 21 /* SSE4 Multiple Packed Sums of Absolute Difference. */ 22 /// Computes sixteen sum of absolute difference (SAD) operations on sets of 23 /// four unsigned 8-bit integers from the 256-bit integer vectors \a X and 24 /// \a Y. 25 /// 26 /// Eight SAD results are computed using the lower half of the input 27 /// vectors, and another eight using the upper half. These 16-bit values 28 /// are returned in the lower and upper halves of the 256-bit result, 29 /// respectively. 30 /// 31 /// A single SAD operation selects four bytes from \a X and four bytes from 32 /// \a Y as input. It computes the differences between each \a X byte and 33 /// the corresponding \a Y byte, takes the absolute value of each 34 /// difference, and sums these four values to form one 16-bit result. The 35 /// intrinsic computes 16 of these results with different sets of input 36 /// bytes. 37 /// 38 /// For each set of eight results, the SAD operations use the same four 39 /// bytes from \a Y; the starting bit position for these four bytes is 40 /// specified by \a M[1:0] times 32. 
The eight operations use successive 41 /// sets of four bytes from \a X; the starting bit position for the first 42 /// set of four bytes is specified by \a M[2] times 32. These bit positions 43 /// are all relative to the 128-bit lane for each set of eight operations. 44 /// 45 /// \code{.operation} 46 /// r := 0 47 /// FOR i := 0 TO 1 48 /// j := i*3 49 /// Ybase := M[j+1:j]*32 + i*128 50 /// Xbase := M[j+2]*32 + i*128 51 /// FOR k := 0 TO 3 52 /// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase]) 53 /// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8]) 54 /// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16]) 55 /// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24]) 56 /// result[r+15:r] := temp0 + temp1 + temp2 + temp3 57 /// Xbase := Xbase + 8 58 /// r := r + 16 59 /// ENDFOR 60 /// ENDFOR 61 /// \endcode 62 /// 63 /// \headerfile <immintrin.h> 64 /// 65 /// \code 66 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M); 67 /// \endcode 68 /// 69 /// This intrinsic corresponds to the \c VMPSADBW instruction. 70 /// 71 /// \param X 72 /// A 256-bit integer vector containing one of the inputs. 73 /// \param Y 74 /// A 256-bit integer vector containing one of the inputs. 75 /// \param M 76 /// An unsigned immediate value specifying the starting positions of the 77 /// bytes to operate on. 78 /// \returns A 256-bit vector of [16 x i16] containing the result. 79 #define _mm256_mpsadbw_epu8(X, Y, M) \ 80 ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ 81 (__v32qi)(__m256i)(Y), (int)(M))) 82 83 /// Computes the absolute value of each signed byte in the 256-bit integer 84 /// vector \a __a and returns each value in the corresponding byte of 85 /// the result. 86 /// 87 /// \headerfile <immintrin.h> 88 /// 89 /// This intrinsic corresponds to the \c VPABSB instruction. 90 /// 91 /// \param __a 92 /// A 256-bit integer vector. 93 /// \returns A 256-bit integer vector containing the result. 
94 static __inline__ __m256i __DEFAULT_FN_ATTRS256 95 _mm256_abs_epi8(__m256i __a) 96 { 97 return (__m256i)__builtin_elementwise_abs((__v32qs)__a); 98 } 99 100 /// Computes the absolute value of each signed 16-bit element in the 256-bit 101 /// vector of [16 x i16] in \a __a and returns each value in the 102 /// corresponding element of the result. 103 /// 104 /// \headerfile <immintrin.h> 105 /// 106 /// This intrinsic corresponds to the \c VPABSW instruction. 107 /// 108 /// \param __a 109 /// A 256-bit vector of [16 x i16]. 110 /// \returns A 256-bit vector of [16 x i16] containing the result. 111 static __inline__ __m256i __DEFAULT_FN_ATTRS256 112 _mm256_abs_epi16(__m256i __a) 113 { 114 return (__m256i)__builtin_elementwise_abs((__v16hi)__a); 115 } 116 117 /// Computes the absolute value of each signed 32-bit element in the 256-bit 118 /// vector of [8 x i32] in \a __a and returns each value in the 119 /// corresponding element of the result. 120 /// 121 /// \headerfile <immintrin.h> 122 /// 123 /// This intrinsic corresponds to the \c VPABSD instruction. 124 /// 125 /// \param __a 126 /// A 256-bit vector of [8 x i32]. 127 /// \returns A 256-bit vector of [8 x i32] containing the result. 128 static __inline__ __m256i __DEFAULT_FN_ATTRS256 129 _mm256_abs_epi32(__m256i __a) 130 { 131 return (__m256i)__builtin_elementwise_abs((__v8si)__a); 132 } 133 134 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit 135 /// integers using signed saturation, and returns the 256-bit result. 136 /// 137 /// \code{.operation} 138 /// FOR i := 0 TO 7 139 /// j := i*16 140 /// k := i*8 141 /// result[7+k:k] := SATURATE8(__a[15+j:j]) 142 /// result[71+k:64+k] := SATURATE8(__b[15+j:j]) 143 /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j]) 144 /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j]) 145 /// ENDFOR 146 /// \endcode 147 /// 148 /// \headerfile <immintrin.h> 149 /// 150 /// This intrinsic corresponds to the \c VPACKSSWB instruction. 
151 /// 152 /// \param __a 153 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and 154 /// result[191:128]. 155 /// \param __b 156 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and 157 /// result[255:192]. 158 /// \returns A 256-bit integer vector containing the result. 159 static __inline__ __m256i __DEFAULT_FN_ATTRS256 160 _mm256_packs_epi16(__m256i __a, __m256i __b) 161 { 162 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); 163 } 164 165 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit 166 /// integers using signed saturation, and returns the resulting 256-bit 167 /// vector of [16 x i16]. 168 /// 169 /// \code{.operation} 170 /// FOR i := 0 TO 3 171 /// j := i*32 172 /// k := i*16 173 /// result[15+k:k] := SATURATE16(__a[31+j:j]) 174 /// result[79+k:64+k] := SATURATE16(__b[31+j:j]) 175 /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j]) 176 /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j]) 177 /// ENDFOR 178 /// \endcode 179 /// 180 /// \headerfile <immintrin.h> 181 /// 182 /// This intrinsic corresponds to the \c VPACKSSDW instruction. 183 /// 184 /// \param __a 185 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and 186 /// result[191:128]. 187 /// \param __b 188 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and 189 /// result[255:192]. 190 /// \returns A 256-bit vector of [16 x i16] containing the result. 191 static __inline__ __m256i __DEFAULT_FN_ATTRS256 192 _mm256_packs_epi32(__m256i __a, __m256i __b) 193 { 194 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); 195 } 196 197 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers 198 /// using unsigned saturation, and returns the 256-bit result. 
199 /// 200 /// \code{.operation} 201 /// FOR i := 0 TO 7 202 /// j := i*16 203 /// k := i*8 204 /// result[7+k:k] := SATURATE8U(__a[15+j:j]) 205 /// result[71+k:64+k] := SATURATE8U(__b[15+j:j]) 206 /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j]) 207 /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j]) 208 /// ENDFOR 209 /// \endcode 210 /// 211 /// \headerfile <immintrin.h> 212 /// 213 /// This intrinsic corresponds to the \c VPACKUSWB instruction. 214 /// 215 /// \param __a 216 /// A 256-bit vector of [16 x i16] used to generate result[63:0] and 217 /// result[191:128]. 218 /// \param __b 219 /// A 256-bit vector of [16 x i16] used to generate result[127:64] and 220 /// result[255:192]. 221 /// \returns A 256-bit integer vector containing the result. 222 static __inline__ __m256i __DEFAULT_FN_ATTRS256 223 _mm256_packus_epi16(__m256i __a, __m256i __b) 224 { 225 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); 226 } 227 228 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers 229 /// using unsigned saturation, and returns the resulting 256-bit vector of 230 /// [16 x i16]. 231 /// 232 /// \code{.operation} 233 /// FOR i := 0 TO 3 234 /// j := i*32 235 /// k := i*16 236 /// result[15+k:k] := SATURATE16U(__V1[31+j:j]) 237 /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j]) 238 /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j]) 239 /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j]) 240 /// ENDFOR 241 /// \endcode 242 /// 243 /// \headerfile <immintrin.h> 244 /// 245 /// This intrinsic corresponds to the \c VPACKUSDW instruction. 246 /// 247 /// \param __V1 248 /// A 256-bit vector of [8 x i32] used to generate result[63:0] and 249 /// result[191:128]. 250 /// \param __V2 251 /// A 256-bit vector of [8 x i32] used to generate result[127:64] and 252 /// result[255:192]. 253 /// \returns A 256-bit vector of [16 x i16] containing the result. 
254 static __inline__ __m256i __DEFAULT_FN_ATTRS256 255 _mm256_packus_epi32(__m256i __V1, __m256i __V2) 256 { 257 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); 258 } 259 260 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer 261 /// vectors and returns the lower 8 bits of each sum in the corresponding 262 /// byte of the 256-bit integer vector result (overflow is ignored). 263 /// 264 /// \headerfile <immintrin.h> 265 /// 266 /// This intrinsic corresponds to the \c VPADDB instruction. 267 /// 268 /// \param __a 269 /// A 256-bit integer vector containing one of the source operands. 270 /// \param __b 271 /// A 256-bit integer vector containing one of the source operands. 272 /// \returns A 256-bit integer vector containing the sums. 273 static __inline__ __m256i __DEFAULT_FN_ATTRS256 274 _mm256_add_epi8(__m256i __a, __m256i __b) 275 { 276 return (__m256i)((__v32qu)__a + (__v32qu)__b); 277 } 278 279 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 280 /// [16 x i16] and returns the lower 16 bits of each sum in the 281 /// corresponding element of the [16 x i16] result (overflow is ignored). 282 /// 283 /// \headerfile <immintrin.h> 284 /// 285 /// This intrinsic corresponds to the \c VPADDW instruction. 286 /// 287 /// \param __a 288 /// A 256-bit vector of [16 x i16] containing one of the source operands. 289 /// \param __b 290 /// A 256-bit vector of [16 x i16] containing one of the source operands. 291 /// \returns A 256-bit vector of [16 x i16] containing the sums. 292 static __inline__ __m256i __DEFAULT_FN_ATTRS256 293 _mm256_add_epi16(__m256i __a, __m256i __b) 294 { 295 return (__m256i)((__v16hu)__a + (__v16hu)__b); 296 } 297 298 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of 299 /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding 300 /// element of the [8 x i32] result (overflow is ignored). 
301 /// 302 /// \headerfile <immintrin.h> 303 /// 304 /// This intrinsic corresponds to the \c VPADDD instruction. 305 /// 306 /// \param __a 307 /// A 256-bit vector of [8 x i32] containing one of the source operands. 308 /// \param __b 309 /// A 256-bit vector of [8 x i32] containing one of the source operands. 310 /// \returns A 256-bit vector of [8 x i32] containing the sums. 311 static __inline__ __m256i __DEFAULT_FN_ATTRS256 312 _mm256_add_epi32(__m256i __a, __m256i __b) 313 { 314 return (__m256i)((__v8su)__a + (__v8su)__b); 315 } 316 317 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of 318 /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding 319 /// element of the [4 x i64] result (overflow is ignored). 320 /// 321 /// \headerfile <immintrin.h> 322 /// 323 /// This intrinsic corresponds to the \c VPADDQ instruction. 324 /// 325 /// \param __a 326 /// A 256-bit vector of [4 x i64] containing one of the source operands. 327 /// \param __b 328 /// A 256-bit vector of [4 x i64] containing one of the source operands. 329 /// \returns A 256-bit vector of [4 x i64] containing the sums. 330 static __inline__ __m256i __DEFAULT_FN_ATTRS256 331 _mm256_add_epi64(__m256i __a, __m256i __b) 332 { 333 return (__m256i)((__v4du)__a + (__v4du)__b); 334 } 335 336 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer 337 /// vectors using signed saturation, and returns each sum in the 338 /// corresponding byte of the 256-bit integer vector result. 339 /// 340 /// \headerfile <immintrin.h> 341 /// 342 /// This intrinsic corresponds to the \c VPADDSB instruction. 343 /// 344 /// \param __a 345 /// A 256-bit integer vector containing one of the source operands. 346 /// \param __b 347 /// A 256-bit integer vector containing one of the source operands. 348 /// \returns A 256-bit integer vector containing the sums. 
349 static __inline__ __m256i __DEFAULT_FN_ATTRS256 350 _mm256_adds_epi8(__m256i __a, __m256i __b) 351 { 352 return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b); 353 } 354 355 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 356 /// [16 x i16] using signed saturation, and returns the [16 x i16] result. 357 /// 358 /// \headerfile <immintrin.h> 359 /// 360 /// This intrinsic corresponds to the \c VPADDSW instruction. 361 /// 362 /// \param __a 363 /// A 256-bit vector of [16 x i16] containing one of the source operands. 364 /// \param __b 365 /// A 256-bit vector of [16 x i16] containing one of the source operands. 366 /// \returns A 256-bit vector of [16 x i16] containing the sums. 367 static __inline__ __m256i __DEFAULT_FN_ATTRS256 368 _mm256_adds_epi16(__m256i __a, __m256i __b) 369 { 370 return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b); 371 } 372 373 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer 374 /// vectors using unsigned saturation, and returns each sum in the 375 /// corresponding byte of the 256-bit integer vector result. 376 /// 377 /// \headerfile <immintrin.h> 378 /// 379 /// This intrinsic corresponds to the \c VPADDUSB instruction. 380 /// 381 /// \param __a 382 /// A 256-bit integer vector containing one of the source operands. 383 /// \param __b 384 /// A 256-bit integer vector containing one of the source operands. 385 /// \returns A 256-bit integer vector containing the sums. 386 static __inline__ __m256i __DEFAULT_FN_ATTRS256 387 _mm256_adds_epu8(__m256i __a, __m256i __b) 388 { 389 return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b); 390 } 391 392 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 393 /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result. 394 /// 395 /// \headerfile <immintrin.h> 396 /// 397 /// This intrinsic corresponds to the \c VPADDUSW instruction. 
398 /// 399 /// \param __a 400 /// A 256-bit vector of [16 x i16] containing one of the source operands. 401 /// \param __b 402 /// A 256-bit vector of [16 x i16] containing one of the source operands. 403 /// \returns A 256-bit vector of [16 x i16] containing the sums. 404 static __inline__ __m256i __DEFAULT_FN_ATTRS256 405 _mm256_adds_epu16(__m256i __a, __m256i __b) 406 { 407 return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b); 408 } 409 410 /// Uses the lower half of the 256-bit vector \a a as the upper half of a 411 /// temporary 256-bit value, and the lower half of the 256-bit vector \a b 412 /// as the lower half of the temporary value. Right-shifts the temporary 413 /// value by \a n bytes, and uses the lower 16 bytes of the shifted value 414 /// as the lower 16 bytes of the result. Uses the upper halves of \a a and 415 /// \a b to make another temporary value, right shifts by \a n, and uses 416 /// the lower 16 bytes of the shifted value as the upper 16 bytes of the 417 /// result. 418 /// 419 /// \headerfile <immintrin.h> 420 /// 421 /// \code 422 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n); 423 /// \endcode 424 /// 425 /// This intrinsic corresponds to the \c VPALIGNR instruction. 426 /// 427 /// \param a 428 /// A 256-bit integer vector containing source values. 429 /// \param b 430 /// A 256-bit integer vector containing source values. 431 /// \param n 432 /// An immediate value specifying the number of bytes to shift. 433 /// \returns A 256-bit integer vector containing the result. 434 #define _mm256_alignr_epi8(a, b, n) \ 435 ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ 436 (__v32qi)(__m256i)(b), (n))) 437 438 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and 439 /// \a __b. 440 /// 441 /// \headerfile <immintrin.h> 442 /// 443 /// This intrinsic corresponds to the \c VPAND instruction. 444 /// 445 /// \param __a 446 /// A 256-bit integer vector. 
447 /// \param __b 448 /// A 256-bit integer vector. 449 /// \returns A 256-bit integer vector containing the result. 450 static __inline__ __m256i __DEFAULT_FN_ATTRS256 451 _mm256_and_si256(__m256i __a, __m256i __b) 452 { 453 return (__m256i)((__v4du)__a & (__v4du)__b); 454 } 455 456 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with 457 /// the bitwise NOT of the 256-bit integer vector in \a __a. 458 /// 459 /// \headerfile <immintrin.h> 460 /// 461 /// This intrinsic corresponds to the \c VPANDN instruction. 462 /// 463 /// \param __a 464 /// A 256-bit integer vector. 465 /// \param __b 466 /// A 256-bit integer vector. 467 /// \returns A 256-bit integer vector containing the result. 468 static __inline__ __m256i __DEFAULT_FN_ATTRS256 469 _mm256_andnot_si256(__m256i __a, __m256i __b) 470 { 471 return (__m256i)(~(__v4du)__a & (__v4du)__b); 472 } 473 474 /// Computes the averages of the corresponding unsigned bytes in the two 475 /// 256-bit integer vectors in \a __a and \a __b and returns each 476 /// average in the corresponding byte of the 256-bit result. 477 /// 478 /// \code{.operation} 479 /// FOR i := 0 TO 31 480 /// j := i*8 481 /// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1 482 /// ENDFOR 483 /// \endcode 484 /// 485 /// \headerfile <immintrin.h> 486 /// 487 /// This intrinsic corresponds to the \c VPAVGB instruction. 488 /// 489 /// \param __a 490 /// A 256-bit integer vector. 491 /// \param __b 492 /// A 256-bit integer vector. 493 /// \returns A 256-bit integer vector containing the result. 494 static __inline__ __m256i __DEFAULT_FN_ATTRS256 495 _mm256_avg_epu8(__m256i __a, __m256i __b) 496 { 497 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); 498 } 499 500 /// Computes the averages of the corresponding unsigned 16-bit integers in 501 /// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns 502 /// each average in the corresponding element of the 256-bit result. 
503 /// 504 /// \code{.operation} 505 /// FOR i := 0 TO 15 506 /// j := i*16 507 /// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1 508 /// ENDFOR 509 /// \endcode 510 /// 511 /// \headerfile <immintrin.h> 512 /// 513 /// This intrinsic corresponds to the \c VPAVGW instruction. 514 /// 515 /// \param __a 516 /// A 256-bit vector of [16 x i16]. 517 /// \param __b 518 /// A 256-bit vector of [16 x i16]. 519 /// \returns A 256-bit vector of [16 x i16] containing the result. 520 static __inline__ __m256i __DEFAULT_FN_ATTRS256 521 _mm256_avg_epu16(__m256i __a, __m256i __b) 522 { 523 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); 524 } 525 526 /// Merges 8-bit integer values from either of the two 256-bit vectors 527 /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns 528 /// the resulting 256-bit integer vector. 529 /// 530 /// \code{.operation} 531 /// FOR i := 0 TO 31 532 /// j := i*8 533 /// IF __M[7+i] == 0 534 /// result[7+j:j] := __V1[7+j:j] 535 /// ELSE 536 /// result[7+j:j] := __V2[7+j:j] 537 /// FI 538 /// ENDFOR 539 /// \endcode 540 /// 541 /// \headerfile <immintrin.h> 542 /// 543 /// This intrinsic corresponds to the \c VPBLENDVB instruction. 544 /// 545 /// \param __V1 546 /// A 256-bit integer vector containing source values. 547 /// \param __V2 548 /// A 256-bit integer vector containing source values. 549 /// \param __M 550 /// A 256-bit integer vector, with bit [7] of each byte specifying the 551 /// source for each corresponding byte of the result. When the mask bit 552 /// is 0, the byte is copied from \a __V1; otherwise, it is copied from 553 /// \a __V2. 554 /// \returns A 256-bit integer vector containing the result. 
555 static __inline__ __m256i __DEFAULT_FN_ATTRS256 556 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) 557 { 558 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, 559 (__v32qi)__M); 560 } 561 562 /// Merges 16-bit integer values from either of the two 256-bit vectors 563 /// \a V1 or \a V2, as specified by the immediate integer operand \a M, 564 /// and returns the resulting 256-bit vector of [16 x i16]. 565 /// 566 /// \code{.operation} 567 /// FOR i := 0 TO 7 568 /// j := i*16 569 /// IF M[i] == 0 570 /// result[7+j:j] := V1[7+j:j] 571 /// result[135+j:128+j] := V1[135+j:128+j] 572 /// ELSE 573 /// result[7+j:j] := V2[7+j:j] 574 /// result[135+j:128+j] := V2[135+j:128+j] 575 /// FI 576 /// ENDFOR 577 /// \endcode 578 /// 579 /// \headerfile <immintrin.h> 580 /// 581 /// \code 582 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M); 583 /// \endcode 584 /// 585 /// This intrinsic corresponds to the \c VPBLENDW instruction. 586 /// 587 /// \param V1 588 /// A 256-bit vector of [16 x i16] containing source values. 589 /// \param V2 590 /// A 256-bit vector of [16 x i16] containing source values. 591 /// \param M 592 /// An immediate 8-bit integer operand, with bits [7:0] specifying the 593 /// source for each element of the result. The position of the mask bit 594 /// corresponds to the index of a copied value. When a mask bit is 0, the 595 /// element is copied from \a V1; otherwise, it is copied from \a V2. 596 /// \a M[0] determines the source for elements 0 and 8, \a M[1] for 597 /// elements 1 and 9, and so forth. 598 /// \returns A 256-bit vector of [16 x i16] containing the result. 
599 #define _mm256_blend_epi16(V1, V2, M) \ 600 ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ 601 (__v16hi)(__m256i)(V2), (int)(M))) 602 603 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and 604 /// \a __b for equality and returns the outcomes in the corresponding 605 /// bytes of the 256-bit result. 606 /// 607 /// \code{.operation} 608 /// FOR i := 0 TO 31 609 /// j := i*8 610 /// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0 611 /// ENDFOR 612 /// \endcode 613 /// 614 /// \headerfile <immintrin.h> 615 /// 616 /// This intrinsic corresponds to the \c VPCMPEQB instruction. 617 /// 618 /// \param __a 619 /// A 256-bit integer vector containing one of the inputs. 620 /// \param __b 621 /// A 256-bit integer vector containing one of the inputs. 622 /// \returns A 256-bit integer vector containing the result. 623 static __inline__ __m256i __DEFAULT_FN_ATTRS256 624 _mm256_cmpeq_epi8(__m256i __a, __m256i __b) 625 { 626 return (__m256i)((__v32qi)__a == (__v32qi)__b); 627 } 628 629 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in 630 /// \a __a and \a __b for equality and returns the outcomes in the 631 /// corresponding elements of the 256-bit result. 632 /// 633 /// \code{.operation} 634 /// FOR i := 0 TO 15 635 /// j := i*16 636 /// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0 637 /// ENDFOR 638 /// \endcode 639 /// 640 /// \headerfile <immintrin.h> 641 /// 642 /// This intrinsic corresponds to the \c VPCMPEQW instruction. 643 /// 644 /// \param __a 645 /// A 256-bit vector of [16 x i16] containing one of the inputs. 646 /// \param __b 647 /// A 256-bit vector of [16 x i16] containing one of the inputs. 648 /// \returns A 256-bit vector of [16 x i16] containing the result. 
649 static __inline__ __m256i __DEFAULT_FN_ATTRS256 650 _mm256_cmpeq_epi16(__m256i __a, __m256i __b) 651 { 652 return (__m256i)((__v16hi)__a == (__v16hi)__b); 653 } 654 655 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in 656 /// \a __a and \a __b for equality and returns the outcomes in the 657 /// corresponding elements of the 256-bit result. 658 /// 659 /// \code{.operation} 660 /// FOR i := 0 TO 7 661 /// j := i*32 662 /// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0 663 /// ENDFOR 664 /// \endcode 665 /// 666 /// \headerfile <immintrin.h> 667 /// 668 /// This intrinsic corresponds to the \c VPCMPEQD instruction. 669 /// 670 /// \param __a 671 /// A 256-bit vector of [8 x i32] containing one of the inputs. 672 /// \param __b 673 /// A 256-bit vector of [8 x i32] containing one of the inputs. 674 /// \returns A 256-bit vector of [8 x i32] containing the result. 675 static __inline__ __m256i __DEFAULT_FN_ATTRS256 676 _mm256_cmpeq_epi32(__m256i __a, __m256i __b) 677 { 678 return (__m256i)((__v8si)__a == (__v8si)__b); 679 } 680 681 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in 682 /// \a __a and \a __b for equality and returns the outcomes in the 683 /// corresponding elements of the 256-bit result. 684 /// 685 /// \code{.operation} 686 /// FOR i := 0 TO 3 687 /// j := i*64 688 /// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0 689 /// ENDFOR 690 /// \endcode 691 /// 692 /// \headerfile <immintrin.h> 693 /// 694 /// This intrinsic corresponds to the \c VPCMPEQQ instruction. 695 /// 696 /// \param __a 697 /// A 256-bit vector of [4 x i64] containing one of the inputs. 698 /// \param __b 699 /// A 256-bit vector of [4 x i64] containing one of the inputs. 700 /// \returns A 256-bit vector of [4 x i64] containing the result. 
701 static __inline__ __m256i __DEFAULT_FN_ATTRS256 702 _mm256_cmpeq_epi64(__m256i __a, __m256i __b) 703 { 704 return (__m256i)((__v4di)__a == (__v4di)__b); 705 } 706 707 /// Compares corresponding signed bytes in the 256-bit integer vectors in 708 /// \a __a and \a __b for greater-than and returns the outcomes in the 709 /// corresponding bytes of the 256-bit result. 710 /// 711 /// \code{.operation} 712 /// FOR i := 0 TO 31 713 /// j := i*8 714 /// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0 715 /// ENDFOR 716 /// \endcode 717 /// 718 /// \headerfile <immintrin.h> 719 /// 720 /// This intrinsic corresponds to the \c VPCMPGTB instruction. 721 /// 722 /// \param __a 723 /// A 256-bit integer vector containing one of the inputs. 724 /// \param __b 725 /// A 256-bit integer vector containing one of the inputs. 726 /// \returns A 256-bit integer vector containing the result. 727 static __inline__ __m256i __DEFAULT_FN_ATTRS256 728 _mm256_cmpgt_epi8(__m256i __a, __m256i __b) 729 { 730 /* This function always performs a signed comparison, but __v32qi is a char 731 which may be signed or unsigned, so use __v32qs. */ 732 return (__m256i)((__v32qs)__a > (__v32qs)__b); 733 } 734 735 /// Compares corresponding signed elements in the 256-bit vectors of 736 /// [16 x i16] in \a __a and \a __b for greater-than and returns the 737 /// outcomes in the corresponding elements of the 256-bit result. 738 /// 739 /// \code{.operation} 740 /// FOR i := 0 TO 15 741 /// j := i*16 742 /// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0 743 /// ENDFOR 744 /// \endcode 745 /// 746 /// \headerfile <immintrin.h> 747 /// 748 /// This intrinsic corresponds to the \c VPCMPGTW instruction. 749 /// 750 /// \param __a 751 /// A 256-bit vector of [16 x i16] containing one of the inputs. 752 /// \param __b 753 /// A 256-bit vector of [16 x i16] containing one of the inputs. 754 /// \returns A 256-bit vector of [16 x i16] containing the result. 
755 static __inline__ __m256i __DEFAULT_FN_ATTRS256 756 _mm256_cmpgt_epi16(__m256i __a, __m256i __b) 757 { 758 return (__m256i)((__v16hi)__a > (__v16hi)__b); 759 } 760 761 /// Compares corresponding signed elements in the 256-bit vectors of 762 /// [8 x i32] in \a __a and \a __b for greater-than and returns the 763 /// outcomes in the corresponding elements of the 256-bit result. 764 /// 765 /// \code{.operation} 766 /// FOR i := 0 TO 7 767 /// j := i*32 768 /// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0 769 /// ENDFOR 770 /// \endcode 771 /// 772 /// \headerfile <immintrin.h> 773 /// 774 /// This intrinsic corresponds to the \c VPCMPGTD instruction. 775 /// 776 /// \param __a 777 /// A 256-bit vector of [8 x i32] containing one of the inputs. 778 /// \param __b 779 /// A 256-bit vector of [8 x i32] containing one of the inputs. 780 /// \returns A 256-bit vector of [8 x i32] containing the result. 781 static __inline__ __m256i __DEFAULT_FN_ATTRS256 782 _mm256_cmpgt_epi32(__m256i __a, __m256i __b) 783 { 784 return (__m256i)((__v8si)__a > (__v8si)__b); 785 } 786 787 /// Compares corresponding signed elements in the 256-bit vectors of 788 /// [4 x i64] in \a __a and \a __b for greater-than and returns the 789 /// outcomes in the corresponding elements of the 256-bit result. 790 /// 791 /// \code{.operation} 792 /// FOR i := 0 TO 3 793 /// j := i*64 794 /// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0 795 /// ENDFOR 796 /// \endcode 797 /// 798 /// \headerfile <immintrin.h> 799 /// 800 /// This intrinsic corresponds to the \c VPCMPGTQ instruction. 801 /// 802 /// \param __a 803 /// A 256-bit vector of [4 x i64] containing one of the inputs. 804 /// \param __b 805 /// A 256-bit vector of [4 x i64] containing one of the inputs. 806 /// \returns A 256-bit vector of [4 x i64] containing the result. 
807 static __inline__ __m256i __DEFAULT_FN_ATTRS256 808 _mm256_cmpgt_epi64(__m256i __a, __m256i __b) 809 { 810 return (__m256i)((__v4di)__a > (__v4di)__b); 811 } 812 813 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit 814 /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an 815 /// element of the [16 x i16] result (overflow is ignored). Sums from 816 /// \a __a are returned in the lower 64 bits of each 128-bit half of the 817 /// result; sums from \a __b are returned in the upper 64 bits of each 818 /// 128-bit half of the result. 819 /// 820 /// \code{.operation} 821 /// FOR i := 0 TO 1 822 /// j := i*128 823 /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16] 824 /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48] 825 /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80] 826 /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112] 827 /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16] 828 /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48] 829 /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80] 830 /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112] 831 /// ENDFOR 832 /// \endcode 833 /// 834 /// \headerfile <immintrin.h> 835 /// 836 /// This intrinsic corresponds to the \c VPHADDW instruction. 837 /// 838 /// \param __a 839 /// A 256-bit vector of [16 x i16] containing one of the source operands. 840 /// \param __b 841 /// A 256-bit vector of [16 x i16] containing one of the source operands. 842 /// \returns A 256-bit vector of [16 x i16] containing the sums. 843 static __inline__ __m256i __DEFAULT_FN_ATTRS256 844 _mm256_hadd_epi16(__m256i __a, __m256i __b) 845 { 846 return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); 847 } 848 849 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit 850 /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an 851 /// element of the [8 x i32] result (overflow is ignored). 
Sums from \a __a 852 /// are returned in the lower 64 bits of each 128-bit half of the result; 853 /// sums from \a __b are returned in the upper 64 bits of each 128-bit half 854 /// of the result. 855 /// 856 /// \code{.operation} 857 /// FOR i := 0 TO 1 858 /// j := i*128 859 /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32] 860 /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96] 861 /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32] 862 /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96] 863 /// ENDFOR 864 /// \endcode 865 /// 866 /// \headerfile <immintrin.h> 867 /// 868 /// This intrinsic corresponds to the \c VPHADDD instruction. 869 /// 870 /// \param __a 871 /// A 256-bit vector of [8 x i32] containing one of the source operands. 872 /// \param __b 873 /// A 256-bit vector of [8 x i32] containing one of the source operands. 874 /// \returns A 256-bit vector of [8 x i32] containing the sums. 875 static __inline__ __m256i __DEFAULT_FN_ATTRS256 876 _mm256_hadd_epi32(__m256i __a, __m256i __b) 877 { 878 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); 879 } 880 881 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit 882 /// vectors of [16 x i16] using signed saturation and returns each sum in 883 /// an element of the [16 x i16] result. Sums from \a __a are returned in 884 /// the lower 64 bits of each 128-bit half of the result; sums from \a __b 885 /// are returned in the upper 64 bits of each 128-bit half of the result. 
886 /// 887 /// \code{.operation} 888 /// FOR i := 0 TO 1 889 /// j := i*128 890 /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16]) 891 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48]) 892 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80]) 893 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112]) 894 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16]) 895 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48]) 896 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80]) 897 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112]) 898 /// ENDFOR 899 /// \endcode 900 /// 901 /// \headerfile <immintrin.h> 902 /// 903 /// This intrinsic corresponds to the \c VPHADDSW instruction. 904 /// 905 /// \param __a 906 /// A 256-bit vector of [16 x i16] containing one of the source operands. 907 /// \param __b 908 /// A 256-bit vector of [16 x i16] containing one of the source operands. 909 /// \returns A 256-bit vector of [16 x i16] containing the sums. 910 static __inline__ __m256i __DEFAULT_FN_ATTRS256 911 _mm256_hadds_epi16(__m256i __a, __m256i __b) 912 { 913 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); 914 } 915 916 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 917 /// vectors of [16 x i16] and returns the lower 16 bits of each difference 918 /// in an element of the [16 x i16] result (overflow is ignored). 919 /// Differences from \a __a are returned in the lower 64 bits of each 920 /// 128-bit half of the result; differences from \a __b are returned in the 921 /// upper 64 bits of each 128-bit half of the result. 
922 /// 923 /// \code{.operation} 924 /// FOR i := 0 TO 1 925 /// j := i*128 926 /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16] 927 /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48] 928 /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80] 929 /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112] 930 /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16] 931 /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48] 932 /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80] 933 /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112] 934 /// ENDFOR 935 /// \endcode 936 /// 937 /// \headerfile <immintrin.h> 938 /// 939 /// This intrinsic corresponds to the \c VPHSUBW instruction. 940 /// 941 /// \param __a 942 /// A 256-bit vector of [16 x i16] containing one of the source operands. 943 /// \param __b 944 /// A 256-bit vector of [16 x i16] containing one of the source operands. 945 /// \returns A 256-bit vector of [16 x i16] containing the differences. 946 static __inline__ __m256i __DEFAULT_FN_ATTRS256 947 _mm256_hsub_epi16(__m256i __a, __m256i __b) 948 { 949 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); 950 } 951 952 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit 953 /// vectors of [8 x i32] and returns the lower 32 bits of each difference in 954 /// an element of the [8 x i32] result (overflow is ignored). Differences 955 /// from \a __a are returned in the lower 64 bits of each 128-bit half of 956 /// the result; differences from \a __b are returned in the upper 64 bits 957 /// of each 128-bit half of the result. 
958 /// 959 /// \code{.operation} 960 /// FOR i := 0 TO 1 961 /// j := i*128 962 /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32] 963 /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96] 964 /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32] 965 /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96] 966 /// ENDFOR 967 /// \endcode 968 /// 969 /// \headerfile <immintrin.h> 970 /// 971 /// This intrinsic corresponds to the \c VPHSUBD instruction. 972 /// 973 /// \param __a 974 /// A 256-bit vector of [8 x i32] containing one of the source operands. 975 /// \param __b 976 /// A 256-bit vector of [8 x i32] containing one of the source operands. 977 /// \returns A 256-bit vector of [8 x i32] containing the differences. 978 static __inline__ __m256i __DEFAULT_FN_ATTRS256 979 _mm256_hsub_epi32(__m256i __a, __m256i __b) 980 { 981 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); 982 } 983 984 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 985 /// vectors of [16 x i16] using signed saturation and returns each sum in 986 /// an element of the [16 x i16] result. Differences from \a __a are 987 /// returned in the lower 64 bits of each 128-bit half of the result; 988 /// differences from \a __b are returned in the upper 64 bits of each 989 /// 128-bit half of the result. 
990 /// 991 /// \code{.operation} 992 /// FOR i := 0 TO 1 993 /// j := i*128 994 /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16]) 995 /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48]) 996 /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80]) 997 /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112]) 998 /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16]) 999 /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48]) 1000 /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80]) 1001 /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112]) 1002 /// ENDFOR 1003 /// \endcode 1004 /// 1005 /// \headerfile <immintrin.h> 1006 /// 1007 /// This intrinsic corresponds to the \c VPHSUBSW instruction. 1008 /// 1009 /// \param __a 1010 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1011 /// \param __b 1012 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1013 /// \returns A 256-bit vector of [16 x i16] containing the differences. 1014 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1015 _mm256_hsubs_epi16(__m256i __a, __m256i __b) 1016 { 1017 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); 1018 } 1019 1020 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a 1021 /// with the corresponding signed byte from the 256-bit integer vector in 1022 /// \a __b, forming signed 16-bit intermediate products. Adds adjacent 1023 /// pairs of those products using signed saturation to form 16-bit sums 1024 /// returned as elements of the [16 x i16] result. 
1025 /// 1026 /// \code{.operation} 1027 /// FOR i := 0 TO 15 1028 /// j := i*16 1029 /// temp1 := __a[j+7:j] * __b[j+7:j] 1030 /// temp2 := __a[j+15:j+8] * __b[j+15:j+8] 1031 /// result[j+15:j] := SATURATE16(temp1 + temp2) 1032 /// ENDFOR 1033 /// \endcode 1034 /// 1035 /// \headerfile <immintrin.h> 1036 /// 1037 /// This intrinsic corresponds to the \c VPMADDUBSW instruction. 1038 /// 1039 /// \param __a 1040 /// A 256-bit vector containing one of the source operands. 1041 /// \param __b 1042 /// A 256-bit vector containing one of the source operands. 1043 /// \returns A 256-bit vector of [16 x i16] containing the result. 1044 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1045 _mm256_maddubs_epi16(__m256i __a, __m256i __b) 1046 { 1047 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); 1048 } 1049 1050 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of 1051 /// [16 x i16], forming 32-bit intermediate products, and adds pairs of 1052 /// those products to form 32-bit sums returned as elements of the 1053 /// [8 x i32] result. 1054 /// 1055 /// There is only one wraparound case: when all four of the 16-bit sources 1056 /// are \c 0x8000, the result will be \c 0x80000000. 1057 /// 1058 /// \code{.operation} 1059 /// FOR i := 0 TO 7 1060 /// j := i*32 1061 /// temp1 := __a[j+15:j] * __b[j+15:j] 1062 /// temp2 := __a[j+31:j+16] * __b[j+31:j+16] 1063 /// result[j+31:j] := temp1 + temp2 1064 /// ENDFOR 1065 /// \endcode 1066 /// 1067 /// \headerfile <immintrin.h> 1068 /// 1069 /// This intrinsic corresponds to the \c VPMADDWD instruction. 1070 /// 1071 /// \param __a 1072 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1073 /// \param __b 1074 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1075 /// \returns A 256-bit vector of [8 x i32] containing the result. 
1076 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1077 _mm256_madd_epi16(__m256i __a, __m256i __b) 1078 { 1079 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); 1080 } 1081 1082 /// Compares the corresponding signed bytes in the two 256-bit integer vectors 1083 /// in \a __a and \a __b and returns the larger of each pair in the 1084 /// corresponding byte of the 256-bit result. 1085 /// 1086 /// \headerfile <immintrin.h> 1087 /// 1088 /// This intrinsic corresponds to the \c VPMAXSB instruction. 1089 /// 1090 /// \param __a 1091 /// A 256-bit integer vector. 1092 /// \param __b 1093 /// A 256-bit integer vector. 1094 /// \returns A 256-bit integer vector containing the result. 1095 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1096 _mm256_max_epi8(__m256i __a, __m256i __b) 1097 { 1098 return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b); 1099 } 1100 1101 /// Compares the corresponding signed 16-bit integers in the two 256-bit 1102 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 1103 /// each pair in the corresponding element of the 256-bit result. 1104 /// 1105 /// \headerfile <immintrin.h> 1106 /// 1107 /// This intrinsic corresponds to the \c VPMAXSW instruction. 1108 /// 1109 /// \param __a 1110 /// A 256-bit vector of [16 x i16]. 1111 /// \param __b 1112 /// A 256-bit vector of [16 x i16]. 1113 /// \returns A 256-bit vector of [16 x i16] containing the result. 1114 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1115 _mm256_max_epi16(__m256i __a, __m256i __b) 1116 { 1117 return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b); 1118 } 1119 1120 /// Compares the corresponding signed 32-bit integers in the two 256-bit 1121 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 1122 /// each pair in the corresponding element of the 256-bit result. 1123 /// 1124 /// \headerfile <immintrin.h> 1125 /// 1126 /// This intrinsic corresponds to the \c VPMAXSD instruction. 
1127 /// 1128 /// \param __a 1129 /// A 256-bit vector of [8 x i32]. 1130 /// \param __b 1131 /// A 256-bit vector of [8 x i32]. 1132 /// \returns A 256-bit vector of [8 x i32] containing the result. 1133 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1134 _mm256_max_epi32(__m256i __a, __m256i __b) 1135 { 1136 return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b); 1137 } 1138 1139 /// Compares the corresponding unsigned bytes in the two 256-bit integer 1140 /// vectors in \a __a and \a __b and returns the larger of each pair in 1141 /// the corresponding byte of the 256-bit result. 1142 /// 1143 /// \headerfile <immintrin.h> 1144 /// 1145 /// This intrinsic corresponds to the \c VPMAXUB instruction. 1146 /// 1147 /// \param __a 1148 /// A 256-bit integer vector. 1149 /// \param __b 1150 /// A 256-bit integer vector. 1151 /// \returns A 256-bit integer vector containing the result. 1152 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1153 _mm256_max_epu8(__m256i __a, __m256i __b) 1154 { 1155 return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b); 1156 } 1157 1158 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit 1159 /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 1160 /// each pair in the corresponding element of the 256-bit result. 1161 /// 1162 /// \headerfile <immintrin.h> 1163 /// 1164 /// This intrinsic corresponds to the \c VPMAXUW instruction. 1165 /// 1166 /// \param __a 1167 /// A 256-bit vector of [16 x i16]. 1168 /// \param __b 1169 /// A 256-bit vector of [16 x i16]. 1170 /// \returns A 256-bit vector of [16 x i16] containing the result. 
1171 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1172 _mm256_max_epu16(__m256i __a, __m256i __b) 1173 { 1174 return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b); 1175 } 1176 1177 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit 1178 /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 1179 /// each pair in the corresponding element of the 256-bit result. 1180 /// 1181 /// \headerfile <immintrin.h> 1182 /// 1183 /// This intrinsic corresponds to the \c VPMAXUD instruction. 1184 /// 1185 /// \param __a 1186 /// A 256-bit vector of [8 x i32]. 1187 /// \param __b 1188 /// A 256-bit vector of [8 x i32]. 1189 /// \returns A 256-bit vector of [8 x i32] containing the result. 1190 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1191 _mm256_max_epu32(__m256i __a, __m256i __b) 1192 { 1193 return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b); 1194 } 1195 1196 /// Compares the corresponding signed bytes in the two 256-bit integer vectors 1197 /// in \a __a and \a __b and returns the smaller of each pair in the 1198 /// corresponding byte of the 256-bit result. 1199 /// 1200 /// \headerfile <immintrin.h> 1201 /// 1202 /// This intrinsic corresponds to the \c VPMINSB instruction. 1203 /// 1204 /// \param __a 1205 /// A 256-bit integer vector. 1206 /// \param __b 1207 /// A 256-bit integer vector. 1208 /// \returns A 256-bit integer vector containing the result. 1209 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1210 _mm256_min_epi8(__m256i __a, __m256i __b) 1211 { 1212 return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b); 1213 } 1214 1215 /// Compares the corresponding signed 16-bit integers in the two 256-bit 1216 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 1217 /// each pair in the corresponding element of the 256-bit result. 1218 /// 1219 /// \headerfile <immintrin.h> 1220 /// 1221 /// This intrinsic corresponds to the \c VPMINSW instruction. 
1222 /// 1223 /// \param __a 1224 /// A 256-bit vector of [16 x i16]. 1225 /// \param __b 1226 /// A 256-bit vector of [16 x i16]. 1227 /// \returns A 256-bit vector of [16 x i16] containing the result. 1228 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1229 _mm256_min_epi16(__m256i __a, __m256i __b) 1230 { 1231 return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b); 1232 } 1233 1234 /// Compares the corresponding signed 32-bit integers in the two 256-bit 1235 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 1236 /// each pair in the corresponding element of the 256-bit result. 1237 /// 1238 /// \headerfile <immintrin.h> 1239 /// 1240 /// This intrinsic corresponds to the \c VPMINSD instruction. 1241 /// 1242 /// \param __a 1243 /// A 256-bit vector of [8 x i32]. 1244 /// \param __b 1245 /// A 256-bit vector of [8 x i32]. 1246 /// \returns A 256-bit vector of [8 x i32] containing the result. 1247 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1248 _mm256_min_epi32(__m256i __a, __m256i __b) 1249 { 1250 return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b); 1251 } 1252 1253 /// Compares the corresponding unsigned bytes in the two 256-bit integer 1254 /// vectors in \a __a and \a __b and returns the smaller of each pair in 1255 /// the corresponding byte of the 256-bit result. 1256 /// 1257 /// \headerfile <immintrin.h> 1258 /// 1259 /// This intrinsic corresponds to the \c VPMINUB instruction. 1260 /// 1261 /// \param __a 1262 /// A 256-bit integer vector. 1263 /// \param __b 1264 /// A 256-bit integer vector. 1265 /// \returns A 256-bit integer vector containing the result. 
1266 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1267 _mm256_min_epu8(__m256i __a, __m256i __b) 1268 { 1269 return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b); 1270 } 1271 1272 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit 1273 /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 1274 /// each pair in the corresponding element of the 256-bit result. 1275 /// 1276 /// \headerfile <immintrin.h> 1277 /// 1278 /// This intrinsic corresponds to the \c VPMINUW instruction. 1279 /// 1280 /// \param __a 1281 /// A 256-bit vector of [16 x i16]. 1282 /// \param __b 1283 /// A 256-bit vector of [16 x i16]. 1284 /// \returns A 256-bit vector of [16 x i16] containing the result. 1285 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1286 _mm256_min_epu16(__m256i __a, __m256i __b) 1287 { 1288 return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b); 1289 } 1290 1291 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit 1292 /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 1293 /// each pair in the corresponding element of the 256-bit result. 1294 /// 1295 /// \headerfile <immintrin.h> 1296 /// 1297 /// This intrinsic corresponds to the \c VPMINUD instruction. 1298 /// 1299 /// \param __a 1300 /// A 256-bit vector of [8 x i32]. 1301 /// \param __b 1302 /// A 256-bit vector of [8 x i32]. 1303 /// \returns A 256-bit vector of [8 x i32] containing the result. 
1304 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1305 _mm256_min_epu32(__m256i __a, __m256i __b) 1306 { 1307 return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b); 1308 } 1309 1310 static __inline__ int __DEFAULT_FN_ATTRS256 1311 _mm256_movemask_epi8(__m256i __a) 1312 { 1313 return __builtin_ia32_pmovmskb256((__v32qi)__a); 1314 } 1315 1316 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns 1317 /// the 16-bit values in the corresponding elements of a 256-bit vector 1318 /// of [16 x i16]. 1319 /// 1320 /// \code{.operation} 1321 /// FOR i := 0 TO 15 1322 /// j := i*8 1323 /// k := i*16 1324 /// result[k+15:k] := SignExtend(__V[j+7:j]) 1325 /// ENDFOR 1326 /// \endcode 1327 /// 1328 /// \headerfile <immintrin.h> 1329 /// 1330 /// This intrinsic corresponds to the \c VPMOVSXBW instruction. 1331 /// 1332 /// \param __V 1333 /// A 128-bit integer vector containing the source bytes. 1334 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended 1335 /// values. 1336 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1337 _mm256_cvtepi8_epi16(__m128i __V) 1338 { 1339 /* This function always performs a signed extension, but __v16qi is a char 1340 which may be signed or unsigned, so use __v16qs. */ 1341 return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); 1342 } 1343 1344 /// Sign-extends bytes from the lower half of the 128-bit integer vector in 1345 /// \a __V and returns the 32-bit values in the corresponding elements of a 1346 /// 256-bit vector of [8 x i32]. 1347 /// 1348 /// \code{.operation} 1349 /// FOR i := 0 TO 7 1350 /// j := i*8 1351 /// k := i*32 1352 /// result[k+31:k] := SignExtend(__V[j+7:j]) 1353 /// ENDFOR 1354 /// \endcode 1355 /// 1356 /// \headerfile <immintrin.h> 1357 /// 1358 /// This intrinsic corresponds to the \c VPMOVSXBD instruction. 1359 /// 1360 /// \param __V 1361 /// A 128-bit integer vector containing the source bytes. 
1362 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended 1363 /// values. 1364 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1365 _mm256_cvtepi8_epi32(__m128i __V) 1366 { 1367 /* This function always performs a signed extension, but __v16qi is a char 1368 which may be signed or unsigned, so use __v16qs. */ 1369 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 1370 } 1371 1372 /// Sign-extends the first four bytes from the 128-bit integer vector in 1373 /// \a __V and returns the 64-bit values in the corresponding elements of a 1374 /// 256-bit vector of [4 x i64]. 1375 /// 1376 /// \code{.operation} 1377 /// result[63:0] := SignExtend(__V[7:0]) 1378 /// result[127:64] := SignExtend(__V[15:8]) 1379 /// result[191:128] := SignExtend(__V[23:16]) 1380 /// result[255:192] := SignExtend(__V[31:24]) 1381 /// \endcode 1382 /// 1383 /// \headerfile <immintrin.h> 1384 /// 1385 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction. 1386 /// 1387 /// \param __V 1388 /// A 128-bit integer vector containing the source bytes. 1389 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1390 /// values. 1391 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1392 _mm256_cvtepi8_epi64(__m128i __V) 1393 { 1394 /* This function always performs a signed extension, but __v16qi is a char 1395 which may be signed or unsigned, so use __v16qs. */ 1396 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); 1397 } 1398 1399 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in 1400 /// \a __V and returns the 32-bit values in the corresponding elements of a 1401 /// 256-bit vector of [8 x i32]. 
1402 /// 1403 /// \code{.operation} 1404 /// FOR i := 0 TO 7 1405 /// j := i*16 1406 /// k := i*32 1407 /// result[k+31:k] := SignExtend(__V[j+15:j]) 1408 /// ENDFOR 1409 /// \endcode 1410 /// 1411 /// \headerfile <immintrin.h> 1412 /// 1413 /// This intrinsic corresponds to the \c VPMOVSXWD instruction. 1414 /// 1415 /// \param __V 1416 /// A 128-bit vector of [8 x i16] containing the source values. 1417 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended 1418 /// values. 1419 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1420 _mm256_cvtepi16_epi32(__m128i __V) 1421 { 1422 return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); 1423 } 1424 1425 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of 1426 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 1427 /// elements of a 256-bit vector of [4 x i64]. 1428 /// 1429 /// \code{.operation} 1430 /// result[63:0] := SignExtend(__V[15:0]) 1431 /// result[127:64] := SignExtend(__V[31:16]) 1432 /// result[191:128] := SignExtend(__V[47:32]) 1433 /// result[255:192] := SignExtend(__V[64:48]) 1434 /// \endcode 1435 /// 1436 /// \headerfile <immintrin.h> 1437 /// 1438 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction. 1439 /// 1440 /// \param __V 1441 /// A 128-bit vector of [8 x i16] containing the source values. 1442 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1443 /// values. 1444 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1445 _mm256_cvtepi16_epi64(__m128i __V) 1446 { 1447 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); 1448 } 1449 1450 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in 1451 /// \a __V and returns the 64-bit values in the corresponding elements of a 1452 /// 256-bit vector of [4 x i64]. 
1453 /// 1454 /// \code{.operation} 1455 /// result[63:0] := SignExtend(__V[31:0]) 1456 /// result[127:64] := SignExtend(__V[63:32]) 1457 /// result[191:128] := SignExtend(__V[95:64]) 1458 /// result[255:192] := SignExtend(__V[127:96]) 1459 /// \endcode 1460 /// 1461 /// \headerfile <immintrin.h> 1462 /// 1463 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction. 1464 /// 1465 /// \param __V 1466 /// A 128-bit vector of [4 x i32] containing the source values. 1467 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 1468 /// values. 1469 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1470 _mm256_cvtepi32_epi64(__m128i __V) 1471 { 1472 return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); 1473 } 1474 1475 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns 1476 /// the 16-bit values in the corresponding elements of a 256-bit vector 1477 /// of [16 x i16]. 1478 /// 1479 /// \code{.operation} 1480 /// FOR i := 0 TO 15 1481 /// j := i*8 1482 /// k := i*16 1483 /// result[k+15:k] := ZeroExtend(__V[j+7:j]) 1484 /// ENDFOR 1485 /// \endcode 1486 /// 1487 /// \headerfile <immintrin.h> 1488 /// 1489 /// This intrinsic corresponds to the \c VPMOVZXBW instruction. 1490 /// 1491 /// \param __V 1492 /// A 128-bit integer vector containing the source bytes. 1493 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended 1494 /// values. 1495 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1496 _mm256_cvtepu8_epi16(__m128i __V) 1497 { 1498 return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); 1499 } 1500 1501 /// Zero-extends bytes from the lower half of the 128-bit integer vector in 1502 /// \a __V and returns the 32-bit values in the corresponding elements of a 1503 /// 256-bit vector of [8 x i32]. 
1504 /// 1505 /// \code{.operation} 1506 /// FOR i := 0 TO 7 1507 /// j := i*8 1508 /// k := i*32 1509 /// result[k+31:k] := ZeroExtend(__V[j+7:j]) 1510 /// ENDFOR 1511 /// \endcode 1512 /// 1513 /// \headerfile <immintrin.h> 1514 /// 1515 /// This intrinsic corresponds to the \c VPMOVZXBD instruction. 1516 /// 1517 /// \param __V 1518 /// A 128-bit integer vector containing the source bytes. 1519 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended 1520 /// values. 1521 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1522 _mm256_cvtepu8_epi32(__m128i __V) 1523 { 1524 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 1525 } 1526 1527 /// Zero-extends the first four bytes from the 128-bit integer vector in 1528 /// \a __V and returns the 64-bit values in the corresponding elements of a 1529 /// 256-bit vector of [4 x i64]. 1530 /// 1531 /// \code{.operation} 1532 /// result[63:0] := ZeroExtend(__V[7:0]) 1533 /// result[127:64] := ZeroExtend(__V[15:8]) 1534 /// result[191:128] := ZeroExtend(__V[23:16]) 1535 /// result[255:192] := ZeroExtend(__V[31:24]) 1536 /// \endcode 1537 /// 1538 /// \headerfile <immintrin.h> 1539 /// 1540 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction. 1541 /// 1542 /// \param __V 1543 /// A 128-bit integer vector containing the source bytes. 1544 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1545 /// values. 1546 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1547 _mm256_cvtepu8_epi64(__m128i __V) 1548 { 1549 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); 1550 } 1551 1552 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in 1553 /// \a __V and returns the 32-bit values in the corresponding elements of a 1554 /// 256-bit vector of [8 x i32]. 
1555 /// 1556 /// \code{.operation} 1557 /// FOR i := 0 TO 7 1558 /// j := i*16 1559 /// k := i*32 1560 /// result[k+31:k] := ZeroExtend(__V[j+15:j]) 1561 /// ENDFOR 1562 /// \endcode 1563 /// 1564 /// \headerfile <immintrin.h> 1565 /// 1566 /// This intrinsic corresponds to the \c VPMOVZXWD instruction. 1567 /// 1568 /// \param __V 1569 /// A 128-bit vector of [8 x i16] containing the source values. 1570 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended 1571 /// values. 1572 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1573 _mm256_cvtepu16_epi32(__m128i __V) 1574 { 1575 return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); 1576 } 1577 1578 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of 1579 /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 1580 /// elements of a 256-bit vector of [4 x i64]. 1581 /// 1582 /// \code{.operation} 1583 /// result[63:0] := ZeroExtend(__V[15:0]) 1584 /// result[127:64] := ZeroExtend(__V[31:16]) 1585 /// result[191:128] := ZeroExtend(__V[47:32]) 1586 /// result[255:192] := ZeroExtend(__V[64:48]) 1587 /// \endcode 1588 /// 1589 /// \headerfile <immintrin.h> 1590 /// 1591 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction. 1592 /// 1593 /// \param __V 1594 /// A 128-bit vector of [8 x i16] containing the source values. 1595 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1596 /// values. 1597 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1598 _mm256_cvtepu16_epi64(__m128i __V) 1599 { 1600 return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); 1601 } 1602 1603 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in 1604 /// \a __V and returns the 64-bit values in the corresponding elements of a 1605 /// 256-bit vector of [4 x i64]. 
1606 /// 1607 /// \code{.operation} 1608 /// result[63:0] := ZeroExtend(__V[31:0]) 1609 /// result[127:64] := ZeroExtend(__V[63:32]) 1610 /// result[191:128] := ZeroExtend(__V[95:64]) 1611 /// result[255:192] := ZeroExtend(__V[127:96]) 1612 /// \endcode 1613 /// 1614 /// \headerfile <immintrin.h> 1615 /// 1616 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction. 1617 /// 1618 /// \param __V 1619 /// A 128-bit vector of [4 x i32] containing the source values. 1620 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 1621 /// values. 1622 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1623 _mm256_cvtepu32_epi64(__m128i __V) 1624 { 1625 return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); 1626 } 1627 1628 /// Multiplies signed 32-bit integers from even-numbered elements of two 1629 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 1630 /// [4 x i64] result. 1631 /// 1632 /// \code{.operation} 1633 /// result[63:0] := __a[31:0] * __b[31:0] 1634 /// result[127:64] := __a[95:64] * __b[95:64] 1635 /// result[191:128] := __a[159:128] * __b[159:128] 1636 /// result[255:192] := __a[223:192] * __b[223:192] 1637 /// \endcode 1638 /// 1639 /// \headerfile <immintrin.h> 1640 /// 1641 /// This intrinsic corresponds to the \c VPMULDQ instruction. 1642 /// 1643 /// \param __a 1644 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1645 /// \param __b 1646 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1647 /// \returns A 256-bit vector of [4 x i64] containing the products. 
1648 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1649 _mm256_mul_epi32(__m256i __a, __m256i __b) 1650 { 1651 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); 1652 } 1653 1654 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1655 /// [16 x i16], truncates the 32-bit results to the most significant 18 1656 /// bits, rounds by adding 1, and returns bits [16:1] of each rounded 1657 /// product in the [16 x i16] result. 1658 /// 1659 /// \code{.operation} 1660 /// FOR i := 0 TO 15 1661 /// j := i*16 1662 /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1 1663 /// result[j+15:j] := temp[16:1] 1664 /// \endcode 1665 /// 1666 /// \headerfile <immintrin.h> 1667 /// 1668 /// This intrinsic corresponds to the \c VPMULHRSW instruction. 1669 /// 1670 /// \param __a 1671 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1672 /// \param __b 1673 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1674 /// \returns A 256-bit vector of [16 x i16] containing the rounded products. 1675 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1676 _mm256_mulhrs_epi16(__m256i __a, __m256i __b) 1677 { 1678 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); 1679 } 1680 1681 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of 1682 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 1683 /// [16 x i16] result. 1684 /// 1685 /// \headerfile <immintrin.h> 1686 /// 1687 /// This intrinsic corresponds to the \c VPMULHUW instruction. 1688 /// 1689 /// \param __a 1690 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1691 /// \param __b 1692 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1693 /// \returns A 256-bit vector of [16 x i16] containing the products. 
1694 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1695 _mm256_mulhi_epu16(__m256i __a, __m256i __b) 1696 { 1697 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); 1698 } 1699 1700 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1701 /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 1702 /// [16 x i16] result. 1703 /// 1704 /// \headerfile <immintrin.h> 1705 /// 1706 /// This intrinsic corresponds to the \c VPMULHW instruction. 1707 /// 1708 /// \param __a 1709 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1710 /// \param __b 1711 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1712 /// \returns A 256-bit vector of [16 x i16] containing the products. 1713 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1714 _mm256_mulhi_epi16(__m256i __a, __m256i __b) 1715 { 1716 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); 1717 } 1718 1719 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 1720 /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the 1721 /// [16 x i16] result. 1722 /// 1723 /// \headerfile <immintrin.h> 1724 /// 1725 /// This intrinsic corresponds to the \c VPMULLW instruction. 1726 /// 1727 /// \param __a 1728 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1729 /// \param __b 1730 /// A 256-bit vector of [16 x i16] containing one of the source operands. 1731 /// \returns A 256-bit vector of [16 x i16] containing the products. 1732 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1733 _mm256_mullo_epi16(__m256i __a, __m256i __b) 1734 { 1735 return (__m256i)((__v16hu)__a * (__v16hu)__b); 1736 } 1737 1738 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of 1739 /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the 1740 /// [8 x i32] result. 
1741 /// 1742 /// \headerfile <immintrin.h> 1743 /// 1744 /// This intrinsic corresponds to the \c VPMULLD instruction. 1745 /// 1746 /// \param __a 1747 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1748 /// \param __b 1749 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1750 /// \returns A 256-bit vector of [8 x i32] containing the products. 1751 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1752 _mm256_mullo_epi32 (__m256i __a, __m256i __b) 1753 { 1754 return (__m256i)((__v8su)__a * (__v8su)__b); 1755 } 1756 1757 /// Multiplies unsigned 32-bit integers from even-numered elements of two 1758 /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 1759 /// [4 x i64] result. 1760 /// 1761 /// \code{.operation} 1762 /// result[63:0] := __a[31:0] * __b[31:0] 1763 /// result[127:64] := __a[95:64] * __b[95:64] 1764 /// result[191:128] := __a[159:128] * __b[159:128] 1765 /// result[255:192] := __a[223:192] * __b[223:192] 1766 /// \endcode 1767 /// 1768 /// \headerfile <immintrin.h> 1769 /// 1770 /// This intrinsic corresponds to the \c VPMULUDQ instruction. 1771 /// 1772 /// \param __a 1773 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1774 /// \param __b 1775 /// A 256-bit vector of [8 x i32] containing one of the source operands. 1776 /// \returns A 256-bit vector of [4 x i64] containing the products. 1777 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1778 _mm256_mul_epu32(__m256i __a, __m256i __b) 1779 { 1780 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); 1781 } 1782 1783 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and 1784 /// \a __b. 1785 /// 1786 /// \headerfile <immintrin.h> 1787 /// 1788 /// This intrinsic corresponds to the \c VPOR instruction. 1789 /// 1790 /// \param __a 1791 /// A 256-bit integer vector. 1792 /// \param __b 1793 /// A 256-bit integer vector. 1794 /// \returns A 256-bit integer vector containing the result. 
1795 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1796 _mm256_or_si256(__m256i __a, __m256i __b) 1797 { 1798 return (__m256i)((__v4du)__a | (__v4du)__b); 1799 } 1800 1801 /// Computes four sum of absolute difference (SAD) operations on sets of eight 1802 /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and 1803 /// \a __b. 1804 /// 1805 /// One SAD result is computed for each set of eight bytes from \a __a and 1806 /// eight bytes from \a __b. The zero-extended SAD value is returned in the 1807 /// corresponding 64-bit element of the result. 1808 /// 1809 /// A single SAD operation takes the differences between the corresponding 1810 /// bytes of \a __a and \a __b, takes the absolute value of each difference, 1811 /// and sums these eight values to form one 16-bit result. This operation 1812 /// is repeated four times with successive sets of eight bytes. 1813 /// 1814 /// \code{.operation} 1815 /// FOR i := 0 TO 3 1816 /// j := i*64 1817 /// temp0 := ABS(__a[j+7:j] - __b[j+7:j]) 1818 /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8]) 1819 /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16]) 1820 /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24]) 1821 /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32]) 1822 /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40]) 1823 /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48]) 1824 /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56]) 1825 /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 + 1826 /// temp4 + temp5 + temp6 + temp7 1827 /// result[j+63:j+16] := 0 1828 /// ENDFOR 1829 /// \endcode 1830 /// 1831 /// \headerfile <immintrin.h> 1832 /// 1833 /// This intrinsic corresponds to the \c VPSADBW instruction. 1834 /// 1835 /// \param __a 1836 /// A 256-bit integer vector. 1837 /// \param __b 1838 /// A 256-bit integer vector. 1839 /// \returns A 256-bit integer vector containing the result. 
1840 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1841 _mm256_sad_epu8(__m256i __a, __m256i __b) 1842 { 1843 return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); 1844 } 1845 1846 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according 1847 /// to control information in the 256-bit integer vector \a __b, and 1848 /// returns the 256-bit result. In effect there are two separate 128-bit 1849 /// shuffles in the lower and upper halves. 1850 /// 1851 /// \code{.operation} 1852 /// FOR i := 0 TO 31 1853 /// j := i*8 1854 /// IF __b[j+7] == 1 1855 /// result[j+7:j] := 0 1856 /// ELSE 1857 /// k := __b[j+3:j] * 8 1858 /// IF i > 15 1859 /// k := k + 128 1860 /// FI 1861 /// result[j+7:j] := __a[k+7:k] 1862 /// FI 1863 /// ENDFOR 1864 /// \endcode 1865 /// 1866 /// \headerfile <immintrin.h> 1867 /// 1868 /// This intrinsic corresponds to the \c VPSHUFB instruction. 1869 /// 1870 /// \param __a 1871 /// A 256-bit integer vector containing source values. 1872 /// \param __b 1873 /// A 256-bit integer vector containing control information to determine 1874 /// what goes into the corresponding byte of the result. If bit 7 of the 1875 /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the 1876 /// control byte specify the index (within the same 128-bit half) of \a __a 1877 /// to copy to the result byte. 1878 /// \returns A 256-bit integer vector containing the result. 1879 static __inline__ __m256i __DEFAULT_FN_ATTRS256 1880 _mm256_shuffle_epi8(__m256i __a, __m256i __b) 1881 { 1882 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); 1883 } 1884 1885 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a 1886 /// according to control information in the integer literal \a imm, and 1887 /// returns the 256-bit result. In effect there are two parallel 128-bit 1888 /// shuffles in the lower and upper halves. 
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))

/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] to use as a source of data for the
///    result.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
1988 #define _mm256_shufflelo_epi16(a, imm) \ 1989 ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) 1990 1991 /// Sets each byte of the result to the corresponding byte of the 256-bit 1992 /// integer vector in \a __a, the negative of that byte, or zero, depending 1993 /// on whether the corresponding byte of the 256-bit integer vector in 1994 /// \a __b is greater than zero, less than zero, or equal to zero, 1995 /// respectively. 1996 /// 1997 /// \headerfile <immintrin.h> 1998 /// 1999 /// This intrinsic corresponds to the \c VPSIGNB instruction. 2000 /// 2001 /// \param __a 2002 /// A 256-bit integer vector. 2003 /// \param __b 2004 /// A 256-bit integer vector]. 2005 /// \returns A 256-bit integer vector containing the result. 2006 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2007 _mm256_sign_epi8(__m256i __a, __m256i __b) 2008 { 2009 return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); 2010 } 2011 2012 /// Sets each element of the result to the corresponding element of the 2013 /// 256-bit vector of [16 x i16] in \a __a, the negative of that element, 2014 /// or zero, depending on whether the corresponding element of the 256-bit 2015 /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or 2016 /// equal to zero, respectively. 2017 /// 2018 /// \headerfile <immintrin.h> 2019 /// 2020 /// This intrinsic corresponds to the \c VPSIGNW instruction. 2021 /// 2022 /// \param __a 2023 /// A 256-bit vector of [16 x i16]. 2024 /// \param __b 2025 /// A 256-bit vector of [16 x i16]. 2026 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2027 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2028 _mm256_sign_epi16(__m256i __a, __m256i __b) 2029 { 2030 return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); 2031 } 2032 2033 /// Sets each element of the result to the corresponding element of the 2034 /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or 2035 /// zero, depending on whether the corresponding element of the 256-bit 2036 /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or 2037 /// equal to zero, respectively. 2038 /// 2039 /// \headerfile <immintrin.h> 2040 /// 2041 /// This intrinsic corresponds to the \c VPSIGND instruction. 2042 /// 2043 /// \param __a 2044 /// A 256-bit vector of [8 x i32]. 2045 /// \param __b 2046 /// A 256-bit vector of [8 x i32]. 2047 /// \returns A 256-bit vector of [8 x i32] containing the result. 2048 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2049 _mm256_sign_epi32(__m256i __a, __m256i __b) 2050 { 2051 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); 2052 } 2053 2054 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2055 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 2056 /// is greater than 15, the returned result is all zeroes. 2057 /// 2058 /// \headerfile <immintrin.h> 2059 /// 2060 /// \code 2061 /// __m256i _mm256_slli_si256(__m256i a, const int imm); 2062 /// \endcode 2063 /// 2064 /// This intrinsic corresponds to the \c VPSLLDQ instruction. 2065 /// 2066 /// \param a 2067 /// A 256-bit integer vector to be shifted. 2068 /// \param imm 2069 /// An unsigned immediate value specifying the shift count (in bytes). 2070 /// \returns A 256-bit integer vector containing the result. 
2071 #define _mm256_slli_si256(a, imm) \ 2072 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2073 2074 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 2075 /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 2076 /// is greater than 15, the returned result is all zeroes. 2077 /// 2078 /// \headerfile <immintrin.h> 2079 /// 2080 /// \code 2081 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm); 2082 /// \endcode 2083 /// 2084 /// This intrinsic corresponds to the \c VPSLLDQ instruction. 2085 /// 2086 /// \param a 2087 /// A 256-bit integer vector to be shifted. 2088 /// \param imm 2089 /// An unsigned immediate value specifying the shift count (in bytes). 2090 /// \returns A 256-bit integer vector containing the result. 2091 #define _mm256_bslli_epi128(a, imm) \ 2092 ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 2093 2094 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2095 /// left by \a __count bits, shifting in zero bits, and returns the result. 2096 /// If \a __count is greater than 15, the returned result is all zeroes. 2097 /// 2098 /// \headerfile <immintrin.h> 2099 /// 2100 /// This intrinsic corresponds to the \c VPSLLW instruction. 2101 /// 2102 /// \param __a 2103 /// A 256-bit vector of [16 x i16] to be shifted. 2104 /// \param __count 2105 /// An unsigned integer value specifying the shift count (in bits). 2106 /// \returns A 256-bit vector of [16 x i16] containing the result. 2107 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2108 _mm256_slli_epi16(__m256i __a, int __count) 2109 { 2110 return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); 2111 } 2112 2113 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2114 /// left by the number of bits specified by the lower 64 bits of \a __count, 2115 /// shifting in zero bits, and returns the result. 
If \a __count is greater 2116 /// than 15, the returned result is all zeroes. 2117 /// 2118 /// \headerfile <immintrin.h> 2119 /// 2120 /// This intrinsic corresponds to the \c VPSLLW instruction. 2121 /// 2122 /// \param __a 2123 /// A 256-bit vector of [16 x i16] to be shifted. 2124 /// \param __count 2125 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2126 /// shift count (in bits). The upper element is ignored. 2127 /// \returns A 256-bit vector of [16 x i16] containing the result. 2128 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2129 _mm256_sll_epi16(__m256i __a, __m128i __count) 2130 { 2131 return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); 2132 } 2133 2134 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2135 /// left by \a __count bits, shifting in zero bits, and returns the result. 2136 /// If \a __count is greater than 31, the returned result is all zeroes. 2137 /// 2138 /// \headerfile <immintrin.h> 2139 /// 2140 /// This intrinsic corresponds to the \c VPSLLD instruction. 2141 /// 2142 /// \param __a 2143 /// A 256-bit vector of [8 x i32] to be shifted. 2144 /// \param __count 2145 /// An unsigned integer value specifying the shift count (in bits). 2146 /// \returns A 256-bit vector of [8 x i32] containing the result. 2147 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2148 _mm256_slli_epi32(__m256i __a, int __count) 2149 { 2150 return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); 2151 } 2152 2153 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2154 /// left by the number of bits given in the lower 64 bits of \a __count, 2155 /// shifting in zero bits, and returns the result. If \a __count is greater 2156 /// than 31, the returned result is all zeroes. 2157 /// 2158 /// \headerfile <immintrin.h> 2159 /// 2160 /// This intrinsic corresponds to the \c VPSLLD instruction. 
2161 /// 2162 /// \param __a 2163 /// A 256-bit vector of [8 x i32] to be shifted. 2164 /// \param __count 2165 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2166 /// shift count (in bits). The upper element is ignored. 2167 /// \returns A 256-bit vector of [8 x i32] containing the result. 2168 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2169 _mm256_sll_epi32(__m256i __a, __m128i __count) 2170 { 2171 return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); 2172 } 2173 2174 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2175 /// left by \a __count bits, shifting in zero bits, and returns the result. 2176 /// If \a __count is greater than 63, the returned result is all zeroes. 2177 /// 2178 /// \headerfile <immintrin.h> 2179 /// 2180 /// This intrinsic corresponds to the \c VPSLLQ instruction. 2181 /// 2182 /// \param __a 2183 /// A 256-bit vector of [4 x i64] to be shifted. 2184 /// \param __count 2185 /// An unsigned integer value specifying the shift count (in bits). 2186 /// \returns A 256-bit vector of [4 x i64] containing the result. 2187 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2188 _mm256_slli_epi64(__m256i __a, int __count) 2189 { 2190 return __builtin_ia32_psllqi256((__v4di)__a, __count); 2191 } 2192 2193 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2194 /// left by the number of bits given in the lower 64 bits of \a __count, 2195 /// shifting in zero bits, and returns the result. If \a __count is greater 2196 /// than 63, the returned result is all zeroes. 2197 /// 2198 /// \headerfile <immintrin.h> 2199 /// 2200 /// This intrinsic corresponds to the \c VPSLLQ instruction. 2201 /// 2202 /// \param __a 2203 /// A 256-bit vector of [4 x i64] to be shifted. 2204 /// \param __count 2205 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2206 /// shift count (in bits). The upper element is ignored. 
2207 /// \returns A 256-bit vector of [4 x i64] containing the result. 2208 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2209 _mm256_sll_epi64(__m256i __a, __m128i __count) 2210 { 2211 return __builtin_ia32_psllq256((__v4di)__a, __count); 2212 } 2213 2214 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2215 /// right by \a __count bits, shifting in sign bits, and returns the result. 2216 /// If \a __count is greater than 15, each element of the result is either 2217 /// 0 or -1 according to the corresponding input sign bit. 2218 /// 2219 /// \headerfile <immintrin.h> 2220 /// 2221 /// This intrinsic corresponds to the \c VPSRAW instruction. 2222 /// 2223 /// \param __a 2224 /// A 256-bit vector of [16 x i16] to be shifted. 2225 /// \param __count 2226 /// An unsigned integer value specifying the shift count (in bits). 2227 /// \returns A 256-bit vector of [16 x i16] containing the result. 2228 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2229 _mm256_srai_epi16(__m256i __a, int __count) 2230 { 2231 return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); 2232 } 2233 2234 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2235 /// right by the number of bits given in the lower 64 bits of \a __count, 2236 /// shifting in sign bits, and returns the result. If \a __count is greater 2237 /// than 15, each element of the result is either 0 or -1 according to the 2238 /// corresponding input sign bit. 2239 /// 2240 /// \headerfile <immintrin.h> 2241 /// 2242 /// This intrinsic corresponds to the \c VPSRAW instruction. 2243 /// 2244 /// \param __a 2245 /// A 256-bit vector of [16 x i16] to be shifted. 2246 /// \param __count 2247 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2248 /// shift count (in bits). The upper element is ignored. 2249 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2250 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2251 _mm256_sra_epi16(__m256i __a, __m128i __count) 2252 { 2253 return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); 2254 } 2255 2256 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2257 /// right by \a __count bits, shifting in sign bits, and returns the result. 2258 /// If \a __count is greater than 31, each element of the result is either 2259 /// 0 or -1 according to the corresponding input sign bit. 2260 /// 2261 /// \headerfile <immintrin.h> 2262 /// 2263 /// This intrinsic corresponds to the \c VPSRAD instruction. 2264 /// 2265 /// \param __a 2266 /// A 256-bit vector of [8 x i32] to be shifted. 2267 /// \param __count 2268 /// An unsigned integer value specifying the shift count (in bits). 2269 /// \returns A 256-bit vector of [8 x i32] containing the result. 2270 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2271 _mm256_srai_epi32(__m256i __a, int __count) 2272 { 2273 return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); 2274 } 2275 2276 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2277 /// right by the number of bits given in the lower 64 bits of \a __count, 2278 /// shifting in sign bits, and returns the result. If \a __count is greater 2279 /// than 31, each element of the result is either 0 or -1 according to the 2280 /// corresponding input sign bit. 2281 /// 2282 /// \headerfile <immintrin.h> 2283 /// 2284 /// This intrinsic corresponds to the \c VPSRAD instruction. 2285 /// 2286 /// \param __a 2287 /// A 256-bit vector of [8 x i32] to be shifted. 2288 /// \param __count 2289 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2290 /// shift count (in bits). The upper element is ignored. 2291 /// \returns A 256-bit vector of [8 x i32] containing the result. 
2292 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2293 _mm256_sra_epi32(__m256i __a, __m128i __count) 2294 { 2295 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); 2296 } 2297 2298 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2299 /// \a imm bytes, shifting in zero bytes, and returns the result. If 2300 /// \a imm is greater than 15, the returned result is all zeroes. 2301 /// 2302 /// \headerfile <immintrin.h> 2303 /// 2304 /// \code 2305 /// __m256i _mm256_srli_si256(__m256i a, const int imm); 2306 /// \endcode 2307 /// 2308 /// This intrinsic corresponds to the \c VPSRLDQ instruction. 2309 /// 2310 /// \param a 2311 /// A 256-bit integer vector to be shifted. 2312 /// \param imm 2313 /// An unsigned immediate value specifying the shift count (in bytes). 2314 /// \returns A 256-bit integer vector containing the result. 2315 #define _mm256_srli_si256(a, imm) \ 2316 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2317 2318 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 2319 /// \a imm bytes, shifting in zero bytes, and returns the result. If 2320 /// \a imm is greater than 15, the returned result is all zeroes. 2321 /// 2322 /// \headerfile <immintrin.h> 2323 /// 2324 /// \code 2325 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm); 2326 /// \endcode 2327 /// 2328 /// This intrinsic corresponds to the \c VPSRLDQ instruction. 2329 /// 2330 /// \param a 2331 /// A 256-bit integer vector to be shifted. 2332 /// \param imm 2333 /// An unsigned immediate value specifying the shift count (in bytes). 2334 /// \returns A 256-bit integer vector containing the result. 2335 #define _mm256_bsrli_epi128(a, imm) \ 2336 ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 2337 2338 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2339 /// right by \a __count bits, shifting in zero bits, and returns the result. 
2340 /// If \a __count is greater than 15, the returned result is all zeroes. 2341 /// 2342 /// \headerfile <immintrin.h> 2343 /// 2344 /// This intrinsic corresponds to the \c VPSRLW instruction. 2345 /// 2346 /// \param __a 2347 /// A 256-bit vector of [16 x i16] to be shifted. 2348 /// \param __count 2349 /// An unsigned integer value specifying the shift count (in bits). 2350 /// \returns A 256-bit vector of [16 x i16] containing the result. 2351 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2352 _mm256_srli_epi16(__m256i __a, int __count) 2353 { 2354 return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); 2355 } 2356 2357 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 2358 /// right by the number of bits given in the lower 64 bits of \a __count, 2359 /// shifting in zero bits, and returns the result. If \a __count is greater 2360 /// than 15, the returned result is all zeroes. 2361 /// 2362 /// \headerfile <immintrin.h> 2363 /// 2364 /// This intrinsic corresponds to the \c VPSRLW instruction. 2365 /// 2366 /// \param __a 2367 /// A 256-bit vector of [16 x i16] to be shifted. 2368 /// \param __count 2369 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2370 /// shift count (in bits). The upper element is ignored. 2371 /// \returns A 256-bit vector of [16 x i16] containing the result. 2372 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2373 _mm256_srl_epi16(__m256i __a, __m128i __count) 2374 { 2375 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); 2376 } 2377 2378 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2379 /// right by \a __count bits, shifting in zero bits, and returns the result. 2380 /// If \a __count is greater than 31, the returned result is all zeroes. 2381 /// 2382 /// \headerfile <immintrin.h> 2383 /// 2384 /// This intrinsic corresponds to the \c VPSRLD instruction. 
2385 /// 2386 /// \param __a 2387 /// A 256-bit vector of [8 x i32] to be shifted. 2388 /// \param __count 2389 /// An unsigned integer value specifying the shift count (in bits). 2390 /// \returns A 256-bit vector of [8 x i32] containing the result. 2391 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2392 _mm256_srli_epi32(__m256i __a, int __count) 2393 { 2394 return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); 2395 } 2396 2397 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 2398 /// right by the number of bits given in the lower 64 bits of \a __count, 2399 /// shifting in zero bits, and returns the result. If \a __count is greater 2400 /// than 31, the returned result is all zeroes. 2401 /// 2402 /// \headerfile <immintrin.h> 2403 /// 2404 /// This intrinsic corresponds to the \c VPSRLD instruction. 2405 /// 2406 /// \param __a 2407 /// A 256-bit vector of [8 x i32] to be shifted. 2408 /// \param __count 2409 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2410 /// shift count (in bits). The upper element is ignored. 2411 /// \returns A 256-bit vector of [8 x i32] containing the result. 2412 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2413 _mm256_srl_epi32(__m256i __a, __m128i __count) 2414 { 2415 return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); 2416 } 2417 2418 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2419 /// right by \a __count bits, shifting in zero bits, and returns the result. 2420 /// If \a __count is greater than 63, the returned result is all zeroes. 2421 /// 2422 /// \headerfile <immintrin.h> 2423 /// 2424 /// This intrinsic corresponds to the \c VPSRLQ instruction. 2425 /// 2426 /// \param __a 2427 /// A 256-bit vector of [4 x i64] to be shifted. 2428 /// \param __count 2429 /// An unsigned integer value specifying the shift count (in bits). 2430 /// \returns A 256-bit vector of [4 x i64] containing the result. 
2431 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2432 _mm256_srli_epi64(__m256i __a, int __count) 2433 { 2434 return __builtin_ia32_psrlqi256((__v4di)__a, __count); 2435 } 2436 2437 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 2438 /// right by the number of bits given in the lower 64 bits of \a __count, 2439 /// shifting in zero bits, and returns the result. If \a __count is greater 2440 /// than 63, the returned result is all zeroes. 2441 /// 2442 /// \headerfile <immintrin.h> 2443 /// 2444 /// This intrinsic corresponds to the \c VPSRLQ instruction. 2445 /// 2446 /// \param __a 2447 /// A 256-bit vector of [4 x i64] to be shifted. 2448 /// \param __count 2449 /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 2450 /// shift count (in bits). The upper element is ignored. 2451 /// \returns A 256-bit vector of [4 x i64] containing the result. 2452 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2453 _mm256_srl_epi64(__m256i __a, __m128i __count) 2454 { 2455 return __builtin_ia32_psrlq256((__v4di)__a, __count); 2456 } 2457 2458 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2459 /// vectors. Returns the lower 8 bits of each difference in the 2460 /// corresponding byte of the 256-bit integer vector result (overflow is 2461 /// ignored). 2462 /// 2463 /// \code{.operation} 2464 /// FOR i := 0 TO 31 2465 /// j := i*8 2466 /// result[j+7:j] := __a[j+7:j] - __b[j+7:j] 2467 /// ENDFOR 2468 /// \endcode 2469 /// 2470 /// \headerfile <immintrin.h> 2471 /// 2472 /// This intrinsic corresponds to the \c VPSUBB instruction. 2473 /// 2474 /// \param __a 2475 /// A 256-bit integer vector containing the minuends. 2476 /// \param __b 2477 /// A 256-bit integer vector containing the subtrahends. 2478 /// \returns A 256-bit integer vector containing the differences. 
2479 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2480 _mm256_sub_epi8(__m256i __a, __m256i __b) 2481 { 2482 return (__m256i)((__v32qu)__a - (__v32qu)__b); 2483 } 2484 2485 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2486 /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in 2487 /// the corresponding element of the [16 x i16] result (overflow is 2488 /// ignored). 2489 /// 2490 /// \code{.operation} 2491 /// FOR i := 0 TO 15 2492 /// j := i*16 2493 /// result[j+15:j] := __a[j+15:j] - __b[j+15:j] 2494 /// ENDFOR 2495 /// \endcode 2496 /// 2497 /// \headerfile <immintrin.h> 2498 /// 2499 /// This intrinsic corresponds to the \c VPSUBW instruction. 2500 /// 2501 /// \param __a 2502 /// A 256-bit vector of [16 x i16] containing the minuends. 2503 /// \param __b 2504 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2505 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2506 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2507 _mm256_sub_epi16(__m256i __a, __m256i __b) 2508 { 2509 return (__m256i)((__v16hu)__a - (__v16hu)__b); 2510 } 2511 2512 /// Subtracts 32-bit integers from corresponding elements of two 256-bit 2513 /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in 2514 /// the corresponding element of the [8 x i32] result (overflow is ignored). 2515 /// 2516 /// \code{.operation} 2517 /// FOR i := 0 TO 7 2518 /// j := i*32 2519 /// result[j+31:j] := __a[j+31:j] - __b[j+31:j] 2520 /// ENDFOR 2521 /// \endcode 2522 /// 2523 /// \headerfile <immintrin.h> 2524 /// 2525 /// This intrinsic corresponds to the \c VPSUBD instruction. 2526 /// 2527 /// \param __a 2528 /// A 256-bit vector of [8 x i32] containing the minuends. 2529 /// \param __b 2530 /// A 256-bit vector of [8 x i32] containing the subtrahends. 2531 /// \returns A 256-bit vector of [8 x i32] containing the differences. 
2532 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2533 _mm256_sub_epi32(__m256i __a, __m256i __b) 2534 { 2535 return (__m256i)((__v8su)__a - (__v8su)__b); 2536 } 2537 2538 /// Subtracts 64-bit integers from corresponding elements of two 256-bit 2539 /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in 2540 /// the corresponding element of the [4 x i64] result (overflow is ignored). 2541 /// 2542 /// \code{.operation} 2543 /// FOR i := 0 TO 3 2544 /// j := i*64 2545 /// result[j+63:j] := __a[j+63:j] - __b[j+63:j] 2546 /// ENDFOR 2547 /// \endcode 2548 /// 2549 /// \headerfile <immintrin.h> 2550 /// 2551 /// This intrinsic corresponds to the \c VPSUBQ instruction. 2552 /// 2553 /// \param __a 2554 /// A 256-bit vector of [4 x i64] containing the minuends. 2555 /// \param __b 2556 /// A 256-bit vector of [4 x i64] containing the subtrahends. 2557 /// \returns A 256-bit vector of [4 x i64] containing the differences. 2558 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2559 _mm256_sub_epi64(__m256i __a, __m256i __b) 2560 { 2561 return (__m256i)((__v4du)__a - (__v4du)__b); 2562 } 2563 2564 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2565 /// vectors using signed saturation, and returns each differences in the 2566 /// corresponding byte of the 256-bit integer vector result. 2567 /// 2568 /// \code{.operation} 2569 /// FOR i := 0 TO 31 2570 /// j := i*8 2571 /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j]) 2572 /// ENDFOR 2573 /// \endcode 2574 /// 2575 /// \headerfile <immintrin.h> 2576 /// 2577 /// This intrinsic corresponds to the \c VPSUBSB instruction. 2578 /// 2579 /// \param __a 2580 /// A 256-bit integer vector containing the minuends. 2581 /// \param __b 2582 /// A 256-bit integer vector containing the subtrahends. 2583 /// \returns A 256-bit integer vector containing the differences. 
2584 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2585 _mm256_subs_epi8(__m256i __a, __m256i __b) 2586 { 2587 return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); 2588 } 2589 2590 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2591 /// vectors of [16 x i16] using signed saturation, and returns each 2592 /// difference in the corresponding element of the [16 x i16] result. 2593 /// 2594 /// \code{.operation} 2595 /// FOR i := 0 TO 15 2596 /// j := i*16 2597 /// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j]) 2598 /// ENDFOR 2599 /// \endcode 2600 /// 2601 /// \headerfile <immintrin.h> 2602 /// 2603 /// This intrinsic corresponds to the \c VPSUBSW instruction. 2604 /// 2605 /// \param __a 2606 /// A 256-bit vector of [16 x i16] containing the minuends. 2607 /// \param __b 2608 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2609 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2610 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2611 _mm256_subs_epi16(__m256i __a, __m256i __b) 2612 { 2613 return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); 2614 } 2615 2616 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 2617 /// vectors using unsigned saturation, and returns each difference in the 2618 /// corresponding byte of the 256-bit integer vector result. For each byte, 2619 /// computes <c> result = __a - __b </c>. 2620 /// 2621 /// \code{.operation} 2622 /// FOR i := 0 TO 31 2623 /// j := i*8 2624 /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j]) 2625 /// ENDFOR 2626 /// \endcode 2627 /// 2628 /// \headerfile <immintrin.h> 2629 /// 2630 /// This intrinsic corresponds to the \c VPSUBUSB instruction. 2631 /// 2632 /// \param __a 2633 /// A 256-bit integer vector containing the minuends. 2634 /// \param __b 2635 /// A 256-bit integer vector containing the subtrahends. 
2636 /// \returns A 256-bit integer vector containing the differences. 2637 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2638 _mm256_subs_epu8(__m256i __a, __m256i __b) 2639 { 2640 return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); 2641 } 2642 2643 /// Subtracts 16-bit integers from corresponding elements of two 256-bit 2644 /// vectors of [16 x i16] using unsigned saturation, and returns each 2645 /// difference in the corresponding element of the [16 x i16] result. 2646 /// 2647 /// \code{.operation} 2648 /// FOR i := 0 TO 15 2649 /// j := i*16 2650 /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j]) 2651 /// ENDFOR 2652 /// \endcode 2653 /// 2654 /// \headerfile <immintrin.h> 2655 /// 2656 /// This intrinsic corresponds to the \c VPSUBUSW instruction. 2657 /// 2658 /// \param __a 2659 /// A 256-bit vector of [16 x i16] containing the minuends. 2660 /// \param __b 2661 /// A 256-bit vector of [16 x i16] containing the subtrahends. 2662 /// \returns A 256-bit vector of [16 x i16] containing the differences. 2663 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2664 _mm256_subs_epu16(__m256i __a, __m256i __b) 2665 { 2666 return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); 2667 } 2668 2669 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2670 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2671 /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as 2672 /// input; other bits in these parameters are ignored. 2673 /// 2674 /// \code{.operation} 2675 /// result[7:0] := __a[71:64] 2676 /// result[15:8] := __b[71:64] 2677 /// result[23:16] := __a[79:72] 2678 /// result[31:24] := __b[79:72] 2679 /// . . . 2680 /// result[127:120] := __b[127:120] 2681 /// result[135:128] := __a[199:192] 2682 /// . . . 
2683 /// result[255:248] := __b[255:248] 2684 /// \endcode 2685 /// 2686 /// \headerfile <immintrin.h> 2687 /// 2688 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction. 2689 /// 2690 /// \param __a 2691 /// A 256-bit integer vector used as the source for the even-numbered bytes 2692 /// of the result. 2693 /// \param __b 2694 /// A 256-bit integer vector used as the source for the odd-numbered bytes 2695 /// of the result. 2696 /// \returns A 256-bit integer vector containing the result. 2697 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2698 _mm256_unpackhi_epi8(__m256i __a, __m256i __b) 2699 { 2700 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); 2701 } 2702 2703 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2704 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2705 /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each 2706 /// 128-bit half of \a __a and \a __b as input; other bits in these 2707 /// parameters are ignored. 2708 /// 2709 /// \code{.operation} 2710 /// result[15:0] := __a[79:64] 2711 /// result[31:16] := __b[79:64] 2712 /// result[47:32] := __a[95:80] 2713 /// result[63:48] := __b[95:80] 2714 /// . . . 2715 /// result[127:112] := __b[127:112] 2716 /// result[143:128] := __a[211:196] 2717 /// . . . 2718 /// result[255:240] := __b[255:240] 2719 /// \endcode 2720 /// 2721 /// \headerfile <immintrin.h> 2722 /// 2723 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction. 2724 /// 2725 /// \param __a 2726 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2727 /// elements of the result. 2728 /// \param __b 2729 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2730 /// elements of the result. 
2731 /// \returns A 256-bit vector of [16 x i16] containing the result. 2732 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2733 _mm256_unpackhi_epi16(__m256i __a, __m256i __b) 2734 { 2735 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 2736 } 2737 2738 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2739 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2740 /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half 2741 /// of \a __a and \a __b as input; other bits in these parameters are 2742 /// ignored. 2743 /// 2744 /// \code{.operation} 2745 /// result[31:0] := __a[95:64] 2746 /// result[63:32] := __b[95:64] 2747 /// result[95:64] := __a[127:96] 2748 /// result[127:96] := __b[127:96] 2749 /// result[159:128] := __a[223:192] 2750 /// result[191:160] := __b[223:192] 2751 /// result[223:192] := __a[255:224] 2752 /// result[255:224] := __b[255:224] 2753 /// \endcode 2754 /// 2755 /// \headerfile <immintrin.h> 2756 /// 2757 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction. 2758 /// 2759 /// \param __a 2760 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2761 /// elements of the result. 2762 /// \param __b 2763 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2764 /// elements of the result. 2765 /// \returns A 256-bit vector of [8 x i32] containing the result. 2766 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2767 _mm256_unpackhi_epi32(__m256i __a, __m256i __b) 2768 { 2769 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); 2770 } 2771 2772 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2773 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2774 /// of [4 x i64]. 
Specifically, uses the upper 64 bits of each 128-bit half 2775 /// of \a __a and \a __b as input; other bits in these parameters are 2776 /// ignored. 2777 /// 2778 /// \code{.operation} 2779 /// result[63:0] := __a[127:64] 2780 /// result[127:64] := __b[127:64] 2781 /// result[191:128] := __a[255:192] 2782 /// result[255:192] := __b[255:192] 2783 /// \endcode 2784 /// 2785 /// \headerfile <immintrin.h> 2786 /// 2787 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction. 2788 /// 2789 /// \param __a 2790 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2791 /// elements of the result. 2792 /// \param __b 2793 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2794 /// elements of the result. 2795 /// \returns A 256-bit vector of [4 x i64] containing the result. 2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2797 _mm256_unpackhi_epi64(__m256i __a, __m256i __b) 2798 { 2799 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); 2800 } 2801 2802 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 2803 /// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 2804 /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as 2805 /// input; other bits in these parameters are ignored. 2806 /// 2807 /// \code{.operation} 2808 /// result[7:0] := __a[7:0] 2809 /// result[15:8] := __b[7:0] 2810 /// result[23:16] := __a[15:8] 2811 /// result[31:24] := __b[15:8] 2812 /// . . . 2813 /// result[127:120] := __b[63:56] 2814 /// result[135:128] := __a[135:128] 2815 /// . . . 2816 /// result[255:248] := __b[191:184] 2817 /// \endcode 2818 /// 2819 /// \headerfile <immintrin.h> 2820 /// 2821 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction. 2822 /// 2823 /// \param __a 2824 /// A 256-bit integer vector used as the source for the even-numbered bytes 2825 /// of the result. 
2826 /// \param __b 2827 /// A 256-bit integer vector used as the source for the odd-numbered bytes 2828 /// of the result. 2829 /// \returns A 256-bit integer vector containing the result. 2830 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2831 _mm256_unpacklo_epi8(__m256i __a, __m256i __b) 2832 { 2833 return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); 2834 } 2835 2836 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 2837 /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 2838 /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each 2839 /// 128-bit half of \a __a and \a __b as input; other bits in these 2840 /// parameters are ignored. 2841 /// 2842 /// \code{.operation} 2843 /// result[15:0] := __a[15:0] 2844 /// result[31:16] := __b[15:0] 2845 /// result[47:32] := __a[31:16] 2846 /// result[63:48] := __b[31:16] 2847 /// . . . 2848 /// result[127:112] := __b[63:48] 2849 /// result[143:128] := __a[143:128] 2850 /// . . . 2851 /// result[255:239] := __b[191:176] 2852 /// \endcode 2853 /// 2854 /// \headerfile <immintrin.h> 2855 /// 2856 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction. 2857 /// 2858 /// \param __a 2859 /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 2860 /// elements of the result. 2861 /// \param __b 2862 /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 2863 /// elements of the result. 2864 /// \returns A 256-bit vector of [16 x i16] containing the result. 
2865 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2866 _mm256_unpacklo_epi16(__m256i __a, __m256i __b) 2867 { 2868 return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); 2869 } 2870 2871 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 2872 /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 2873 /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half 2874 /// of \a __a and \a __b as input; other bits in these parameters are 2875 /// ignored. 2876 /// 2877 /// \code{.operation} 2878 /// result[31:0] := __a[31:0] 2879 /// result[63:32] := __b[31:0] 2880 /// result[95:64] := __a[63:32] 2881 /// result[127:96] := __b[63:32] 2882 /// result[159:128] := __a[159:128] 2883 /// result[191:160] := __b[159:128] 2884 /// result[223:192] := __a[191:160] 2885 /// result[255:224] := __b[191:190] 2886 /// \endcode 2887 /// 2888 /// \headerfile <immintrin.h> 2889 /// 2890 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction. 2891 /// 2892 /// \param __a 2893 /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 2894 /// elements of the result. 2895 /// \param __b 2896 /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 2897 /// elements of the result. 2898 /// \returns A 256-bit vector of [8 x i32] containing the result. 2899 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2900 _mm256_unpacklo_epi32(__m256i __a, __m256i __b) 2901 { 2902 return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); 2903 } 2904 2905 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 2906 /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 2907 /// of [4 x i64]. 
Specifically, uses the lower 64 bits of each 128-bit half 2908 /// of \a __a and \a __b as input; other bits in these parameters are 2909 /// ignored. 2910 /// 2911 /// \code{.operation} 2912 /// result[63:0] := __a[63:0] 2913 /// result[127:64] := __b[63:0] 2914 /// result[191:128] := __a[191:128] 2915 /// result[255:192] := __b[191:128] 2916 /// \endcode 2917 /// 2918 /// \headerfile <immintrin.h> 2919 /// 2920 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction. 2921 /// 2922 /// \param __a 2923 /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 2924 /// elements of the result. 2925 /// \param __b 2926 /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 2927 /// elements of the result. 2928 /// \returns A 256-bit vector of [4 x i64] containing the result. 2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2930 _mm256_unpacklo_epi64(__m256i __a, __m256i __b) 2931 { 2932 return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); 2933 } 2934 2935 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and 2936 /// \a __b. 2937 /// 2938 /// \headerfile <immintrin.h> 2939 /// 2940 /// This intrinsic corresponds to the \c VPXOR instruction. 2941 /// 2942 /// \param __a 2943 /// A 256-bit integer vector. 2944 /// \param __b 2945 /// A 256-bit integer vector. 2946 /// \returns A 256-bit integer vector containing the result. 2947 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2948 _mm256_xor_si256(__m256i __a, __m256i __b) 2949 { 2950 return (__m256i)((__v4du)__a ^ (__v4du)__b); 2951 } 2952 2953 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal 2954 /// memory hint and returns the vector. \a __V must be aligned on a 32-byte 2955 /// boundary. 2956 /// 2957 /// \headerfile <immintrin.h> 2958 /// 2959 /// This intrinsic corresponds to the \c VMOVNTDQA instruction. 
2960 /// 2961 /// \param __V 2962 /// A pointer to the 32-byte aligned memory containing the vector to load. 2963 /// \returns A 256-bit integer vector loaded from memory. 2964 static __inline__ __m256i __DEFAULT_FN_ATTRS256 2965 _mm256_stream_load_si256(__m256i const *__V) 2966 { 2967 typedef __v4di __v4di_aligned __attribute__((aligned(32))); 2968 return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); 2969 } 2970 2971 /// Broadcasts the 32-bit floating-point value from the low element of the 2972 /// 128-bit vector of [4 x float] in \a __X to all elements of the result's 2973 /// 128-bit vector of [4 x float]. 2974 /// 2975 /// \headerfile <immintrin.h> 2976 /// 2977 /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 2978 /// 2979 /// \param __X 2980 /// A 128-bit vector of [4 x float] whose low element will be broadcast. 2981 /// \returns A 128-bit vector of [4 x float] containing the result. 2982 static __inline__ __m128 __DEFAULT_FN_ATTRS128 2983 _mm_broadcastss_ps(__m128 __X) 2984 { 2985 return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); 2986 } 2987 2988 /// Broadcasts the 64-bit floating-point value from the low element of the 2989 /// 128-bit vector of [2 x double] in \a __a to both elements of the 2990 /// result's 128-bit vector of [2 x double]. 2991 /// 2992 /// \headerfile <immintrin.h> 2993 /// 2994 /// This intrinsic corresponds to the \c MOVDDUP instruction. 2995 /// 2996 /// \param __a 2997 /// A 128-bit vector of [2 x double] whose low element will be broadcast. 2998 /// \returns A 128-bit vector of [2 x double] containing the result. 
2999 static __inline__ __m128d __DEFAULT_FN_ATTRS128 3000 _mm_broadcastsd_pd(__m128d __a) 3001 { 3002 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 3003 } 3004 3005 /// Broadcasts the 32-bit floating-point value from the low element of the 3006 /// 128-bit vector of [4 x float] in \a __X to all elements of the 3007 /// result's 256-bit vector of [8 x float]. 3008 /// 3009 /// \headerfile <immintrin.h> 3010 /// 3011 /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 3012 /// 3013 /// \param __X 3014 /// A 128-bit vector of [4 x float] whose low element will be broadcast. 3015 /// \returns A 256-bit vector of [8 x float] containing the result. 3016 static __inline__ __m256 __DEFAULT_FN_ATTRS256 3017 _mm256_broadcastss_ps(__m128 __X) 3018 { 3019 return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3020 } 3021 3022 /// Broadcasts the 64-bit floating-point value from the low element of the 3023 /// 128-bit vector of [2 x double] in \a __X to all elements of the 3024 /// result's 256-bit vector of [4 x double]. 3025 /// 3026 /// \headerfile <immintrin.h> 3027 /// 3028 /// This intrinsic corresponds to the \c VBROADCASTSD instruction. 3029 /// 3030 /// \param __X 3031 /// A 128-bit vector of [2 x double] whose low element will be broadcast. 3032 /// \returns A 256-bit vector of [4 x double] containing the result. 3033 static __inline__ __m256d __DEFAULT_FN_ATTRS256 3034 _mm256_broadcastsd_pd(__m128d __X) 3035 { 3036 return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); 3037 } 3038 3039 /// Broadcasts the 128-bit integer data from \a __X to both the lower and 3040 /// upper halves of the 256-bit result. 3041 /// 3042 /// \headerfile <immintrin.h> 3043 /// 3044 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction. 3045 /// 3046 /// \param __X 3047 /// A 128-bit integer vector to be broadcast. 3048 /// \returns A 256-bit integer vector containing the result. 
3049 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3050 _mm256_broadcastsi128_si256(__m128i __X) 3051 { 3052 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); 3053 } 3054 3055 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) 3056 3057 /// Merges 32-bit integer elements from either of the two 128-bit vectors of 3058 /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32], 3059 /// as specified by the immediate integer operand \a M. 3060 /// 3061 /// \code{.operation} 3062 /// FOR i := 0 TO 3 3063 /// j := i*32 3064 /// IF M[i] == 0 3065 /// result[31+j:j] := V1[31+j:j] 3066 /// ELSE 3067 /// result[31+j:j] := V2[32+j:j] 3068 /// FI 3069 /// ENDFOR 3070 /// \endcode 3071 /// 3072 /// \headerfile <immintrin.h> 3073 /// 3074 /// \code 3075 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M); 3076 /// \endcode 3077 /// 3078 /// This intrinsic corresponds to the \c VPBLENDDD instruction. 3079 /// 3080 /// \param V1 3081 /// A 128-bit vector of [4 x i32] containing source values. 3082 /// \param V2 3083 /// A 128-bit vector of [4 x i32] containing source values. 3084 /// \param M 3085 /// An immediate 8-bit integer operand, with bits [3:0] specifying the 3086 /// source for each element of the result. The position of the mask bit 3087 /// corresponds to the index of a copied value. When a mask bit is 0, the 3088 /// element is copied from \a V1; otherwise, it is copied from \a V2. 3089 /// \returns A 128-bit vector of [4 x i32] containing the result. 3090 #define _mm_blend_epi32(V1, V2, M) \ 3091 ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ 3092 (__v4si)(__m128i)(V2), (int)(M))) 3093 3094 /// Merges 32-bit integer elements from either of the two 256-bit vectors of 3095 /// [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32], 3096 /// as specified by the immediate integer operand \a M. 
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))

/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
///    bytes of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
///
/// \param __X
///    A 128-bit integer vector whose low byte will be broadcast.
/// \returns A 256-bit integer vector containing the result.
3141 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3142 _mm256_broadcastb_epi8(__m128i __X) 3143 { 3144 return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3145 } 3146 3147 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X 3148 /// to all elements of the result's 256-bit vector of [16 x i16]. 3149 /// 3150 /// \headerfile <immintrin.h> 3151 /// 3152 /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3153 /// 3154 /// \param __X 3155 /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3156 /// \returns A 256-bit vector of [16 x i16] containing the result. 3157 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3158 _mm256_broadcastw_epi16(__m128i __X) 3159 { 3160 return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3161 } 3162 3163 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3164 /// to all elements of the result's 256-bit vector of [8 x i32]. 3165 /// 3166 /// \headerfile <immintrin.h> 3167 /// 3168 /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 3169 /// 3170 /// \param __X 3171 /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3172 /// \returns A 256-bit vector of [8 x i32] containing the result. 3173 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3174 _mm256_broadcastd_epi32(__m128i __X) 3175 { 3176 return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3177 } 3178 3179 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3180 /// to all elements of the result's 256-bit vector of [4 x i64]. 3181 /// 3182 /// \headerfile <immintrin.h> 3183 /// 3184 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 
3185 /// 3186 /// \param __X 3187 /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3188 /// \returns A 256-bit vector of [4 x i64] containing the result. 3189 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3190 _mm256_broadcastq_epi64(__m128i __X) 3191 { 3192 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); 3193 } 3194 3195 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all 3196 /// bytes of the 128-bit result. 3197 /// 3198 /// \headerfile <immintrin.h> 3199 /// 3200 /// This intrinsic corresponds to the \c VPBROADCASTB instruction. 3201 /// 3202 /// \param __X 3203 /// A 128-bit integer vector whose low byte will be broadcast. 3204 /// \returns A 128-bit integer vector containing the result. 3205 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3206 _mm_broadcastb_epi8(__m128i __X) 3207 { 3208 return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 3209 } 3210 3211 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in 3212 /// \a __X to all elements of the result's 128-bit vector of [8 x i16]. 3213 /// 3214 /// \headerfile <immintrin.h> 3215 /// 3216 /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 3217 /// 3218 /// \param __X 3219 /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 3220 /// \returns A 128-bit vector of [8 x i16] containing the result. 3221 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3222 _mm_broadcastw_epi16(__m128i __X) 3223 { 3224 return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); 3225 } 3226 3227 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 3228 /// to all elements of the result's vector of [4 x i32]. 3229 /// 3230 /// \headerfile <immintrin.h> 3231 /// 3232 /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 
3233 /// 3234 /// \param __X 3235 /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 3236 /// \returns A 128-bit vector of [4 x i32] containing the result. 3237 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3238 _mm_broadcastd_epi32(__m128i __X) 3239 { 3240 return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); 3241 } 3242 3243 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 3244 /// to both elements of the result's 128-bit vector of [2 x i64]. 3245 /// 3246 /// \headerfile <immintrin.h> 3247 /// 3248 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 3249 /// 3250 /// \param __X 3251 /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 3252 /// \returns A 128-bit vector of [2 x i64] containing the result. 3253 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3254 _mm_broadcastq_epi64(__m128i __X) 3255 { 3256 return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); 3257 } 3258 3259 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 3260 /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the 3261 /// elements of the 256-bit vector of [8 x i32] in \a __b. 3262 /// 3263 /// \code{.operation} 3264 /// FOR i := 0 TO 7 3265 /// j := i*32 3266 /// k := __b[j+2:j] * 32 3267 /// result[j+31:j] := __a[k+31:k] 3268 /// ENDFOR 3269 /// \endcode 3270 /// 3271 /// \headerfile <immintrin.h> 3272 /// 3273 /// This intrinsic corresponds to the \c VPERMD instruction. 3274 /// 3275 /// \param __a 3276 /// A 256-bit vector of [8 x i32] containing the source values. 3277 /// \param __b 3278 /// A 256-bit vector of [8 x i32] containing indexes of values to use from 3279 /// \a __a. 3280 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3281 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3282 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) 3283 { 3284 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); 3285 } 3286 3287 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of 3288 /// the 256-bit vector of [4 x double] in \a V as specified by the 3289 /// immediate value \a M. 3290 /// 3291 /// \code{.operation} 3292 /// FOR i := 0 TO 3 3293 /// j := i*64 3294 /// k := (M >> i*2)[1:0] * 64 3295 /// result[j+63:j] := V[k+63:k] 3296 /// ENDFOR 3297 /// \endcode 3298 /// 3299 /// \headerfile <immintrin.h> 3300 /// 3301 /// \code 3302 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M); 3303 /// \endcode 3304 /// 3305 /// This intrinsic corresponds to the \c VPERMPD instruction. 3306 /// 3307 /// \param V 3308 /// A 256-bit vector of [4 x double] containing the source values. 3309 /// \param M 3310 /// An immediate 8-bit value specifying which elements to copy from \a V. 3311 /// \a M[1:0] specifies the index in \a a for element 0 of the result, 3312 /// \a M[3:2] specifies the index for element 1, and so forth. 3313 /// \returns A 256-bit vector of [4 x double] containing the result. 3314 #define _mm256_permute4x64_pd(V, M) \ 3315 ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) 3316 3317 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of 3318 /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in 3319 /// the elements of the 256-bit vector of [8 x i32] in \a __b. 3320 /// 3321 /// \code{.operation} 3322 /// FOR i := 0 TO 7 3323 /// j := i*32 3324 /// k := __b[j+2:j] * 32 3325 /// result[j+31:j] := __a[k+31:k] 3326 /// ENDFOR 3327 /// \endcode 3328 /// 3329 /// \headerfile <immintrin.h> 3330 /// 3331 /// This intrinsic corresponds to the \c VPERMPS instruction. 3332 /// 3333 /// \param __a 3334 /// A 256-bit vector of [8 x float] containing the source values. 
3335 /// \param __b 3336 /// A 256-bit vector of [8 x i32] containing indexes of values to use from 3337 /// \a __a. 3338 /// \returns A 256-bit vector of [8 x float] containing the result. 3339 static __inline__ __m256 __DEFAULT_FN_ATTRS256 3340 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) 3341 { 3342 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); 3343 } 3344 3345 /// Sets the result's 256-bit vector of [4 x i64] result to copies of elements 3346 /// of the 256-bit vector of [4 x i64] in \a V as specified by the 3347 /// immediate value \a M. 3348 /// 3349 /// \code{.operation} 3350 /// FOR i := 0 TO 3 3351 /// j := i*64 3352 /// k := (M >> i*2)[1:0] * 64 3353 /// result[j+63:j] := V[k+63:k] 3354 /// ENDFOR 3355 /// \endcode 3356 /// 3357 /// \headerfile <immintrin.h> 3358 /// 3359 /// \code 3360 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M); 3361 /// \endcode 3362 /// 3363 /// This intrinsic corresponds to the \c VPERMQ instruction. 3364 /// 3365 /// \param V 3366 /// A 256-bit vector of [4 x i64] containing the source values. 3367 /// \param M 3368 /// An immediate 8-bit value specifying which elements to copy from \a V. 3369 /// \a M[1:0] specifies the index in \a a for element 0 of the result, 3370 /// \a M[3:2] specifies the index for element 1, and so forth. 3371 /// \returns A 256-bit vector of [4 x i64] containing the result. 3372 #define _mm256_permute4x64_epi64(V, M) \ 3373 ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) 3374 3375 /// Sets each half of the 256-bit result either to zero or to one of the 3376 /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2, 3377 /// as specified by the immediate value \a M. 
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))

/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///    of the immediate \a M is zero, extracts the lower half of \a V;
///    otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))

/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///    result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///    is zero, overwrites the lower half of the result; otherwise,
///    overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result: the
///    lower half if bit 0 is zero, otherwise the upper half.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                         (__v2di)(__m128i)(V2), (int)(M)))

/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
///    the most significant bit of the corresponding element in the mask
///    \a __M is set; otherwise, sets that element of the result to zero.
///    Returns the 256-bit [8 x i32] result.
3469 /// 3470 /// \code{.operation} 3471 /// FOR i := 0 TO 7 3472 /// j := i*32 3473 /// IF __M[j+31] == 1 3474 /// result[j+31:j] := Load32(__X+(i*4)) 3475 /// ELSE 3476 /// result[j+31:j] := 0 3477 /// FI 3478 /// ENDFOR 3479 /// \endcode 3480 /// 3481 /// \headerfile <immintrin.h> 3482 /// 3483 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3484 /// 3485 /// \param __X 3486 /// A pointer to the memory used for loading values. 3487 /// \param __M 3488 /// A 256-bit vector of [8 x i32] containing the mask bits. 3489 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed 3490 /// elements. 3491 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3492 _mm256_maskload_epi32(int const *__X, __m256i __M) 3493 { 3494 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); 3495 } 3496 3497 /// Conditionally loads four 64-bit integer elements from memory \a __X, if 3498 /// the most significant bit of the corresponding element in the mask 3499 /// \a __M is set; otherwise, sets that element of the result to zero. 3500 /// Returns the 256-bit [4 x i64] result. 3501 /// 3502 /// \code{.operation} 3503 /// FOR i := 0 TO 3 3504 /// j := i*64 3505 /// IF __M[j+63] == 1 3506 /// result[j+63:j] := Load64(__X+(i*8)) 3507 /// ELSE 3508 /// result[j+63:j] := 0 3509 /// FI 3510 /// ENDFOR 3511 /// \endcode 3512 /// 3513 /// \headerfile <immintrin.h> 3514 /// 3515 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3516 /// 3517 /// \param __X 3518 /// A pointer to the memory used for loading values. 3519 /// \param __M 3520 /// A 256-bit vector of [4 x i64] containing the mask bits. 3521 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed 3522 /// elements. 
3523 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3524 _mm256_maskload_epi64(long long const *__X, __m256i __M) 3525 { 3526 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); 3527 } 3528 3529 /// Conditionally loads four 32-bit integer elements from memory \a __X, if 3530 /// the most significant bit of the corresponding element in the mask 3531 /// \a __M is set; otherwise, sets that element of the result to zero. 3532 /// Returns the 128-bit [4 x i32] result. 3533 /// 3534 /// \code{.operation} 3535 /// FOR i := 0 TO 3 3536 /// j := i*32 3537 /// IF __M[j+31] == 1 3538 /// result[j+31:j] := Load32(__X+(i*4)) 3539 /// ELSE 3540 /// result[j+31:j] := 0 3541 /// FI 3542 /// ENDFOR 3543 /// \endcode 3544 /// 3545 /// \headerfile <immintrin.h> 3546 /// 3547 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3548 /// 3549 /// \param __X 3550 /// A pointer to the memory used for loading values. 3551 /// \param __M 3552 /// A 128-bit vector of [4 x i32] containing the mask bits. 3553 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed 3554 /// elements. 3555 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3556 _mm_maskload_epi32(int const *__X, __m128i __M) 3557 { 3558 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); 3559 } 3560 3561 /// Conditionally loads two 64-bit integer elements from memory \a __X, if 3562 /// the most significant bit of the corresponding element in the mask 3563 /// \a __M is set; otherwise, sets that element of the result to zero. 3564 /// Returns the 128-bit [2 x i64] result. 3565 /// 3566 /// \code{.operation} 3567 /// FOR i := 0 TO 1 3568 /// j := i*64 3569 /// IF __M[j+63] == 1 3570 /// result[j+63:j] := Load64(__X+(i*8)) 3571 /// ELSE 3572 /// result[j+63:j] := 0 3573 /// FI 3574 /// ENDFOR 3575 /// \endcode 3576 /// 3577 /// \headerfile <immintrin.h> 3578 /// 3579 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 
3580 /// 3581 /// \param __X 3582 /// A pointer to the memory used for loading values. 3583 /// \param __M 3584 /// A 128-bit vector of [2 x i64] containing the mask bits. 3585 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed 3586 /// elements. 3587 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3588 _mm_maskload_epi64(long long const *__X, __m128i __M) 3589 { 3590 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); 3591 } 3592 3593 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector 3594 /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of 3595 /// the corresponding element in the mask \a __M is set; otherwise, the 3596 /// memory element is unchanged. 3597 /// 3598 /// \code{.operation} 3599 /// FOR i := 0 TO 7 3600 /// j := i*32 3601 /// IF __M[j+31] == 1 3602 /// Store32(__X+(i*4), __Y[j+31:j]) 3603 /// FI 3604 /// ENDFOR 3605 /// \endcode 3606 /// 3607 /// \headerfile <immintrin.h> 3608 /// 3609 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3610 /// 3611 /// \param __X 3612 /// A pointer to the memory used for storing values. 3613 /// \param __M 3614 /// A 256-bit vector of [8 x i32] containing the mask bits. 3615 /// \param __Y 3616 /// A 256-bit vector of [8 x i32] containing the values to store. 3617 static __inline__ void __DEFAULT_FN_ATTRS256 3618 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) 3619 { 3620 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 3621 } 3622 3623 /// Conditionally stores four 64-bit integer elements from the 256-bit vector 3624 /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of 3625 /// the corresponding element in the mask \a __M is set; otherwise, the 3626 /// memory element is unchanged. 
3627 /// 3628 /// \code{.operation} 3629 /// FOR i := 0 TO 3 3630 /// j := i*64 3631 /// IF __M[j+63] == 1 3632 /// Store64(__X+(i*8), __Y[j+63:j]) 3633 /// FI 3634 /// ENDFOR 3635 /// \endcode 3636 /// 3637 /// \headerfile <immintrin.h> 3638 /// 3639 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3640 /// 3641 /// \param __X 3642 /// A pointer to the memory used for storing values. 3643 /// \param __M 3644 /// A 256-bit vector of [4 x i64] containing the mask bits. 3645 /// \param __Y 3646 /// A 256-bit vector of [4 x i64] containing the values to store. 3647 static __inline__ void __DEFAULT_FN_ATTRS256 3648 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) 3649 { 3650 __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 3651 } 3652 3653 /// Conditionally stores four 32-bit integer elements from the 128-bit vector 3654 /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of 3655 /// the corresponding element in the mask \a __M is set; otherwise, the 3656 /// memory element is unchanged. 3657 /// 3658 /// \code{.operation} 3659 /// FOR i := 0 TO 3 3660 /// j := i*32 3661 /// IF __M[j+31] == 1 3662 /// Store32(__X+(i*4), __Y[j+31:j]) 3663 /// FI 3664 /// ENDFOR 3665 /// \endcode 3666 /// 3667 /// \headerfile <immintrin.h> 3668 /// 3669 /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 3670 /// 3671 /// \param __X 3672 /// A pointer to the memory used for storing values. 3673 /// \param __M 3674 /// A 128-bit vector of [4 x i32] containing the mask bits. 3675 /// \param __Y 3676 /// A 128-bit vector of [4 x i32] containing the values to store. 
3677 static __inline__ void __DEFAULT_FN_ATTRS128 3678 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) 3679 { 3680 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 3681 } 3682 3683 /// Conditionally stores two 64-bit integer elements from the 128-bit vector 3684 /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of 3685 /// the corresponding element in the mask \a __M is set; otherwise, the 3686 /// memory element is unchanged. 3687 /// 3688 /// \code{.operation} 3689 /// FOR i := 0 TO 1 3690 /// j := i*64 3691 /// IF __M[j+63] == 1 3692 /// Store64(__X+(i*8), __Y[j+63:j]) 3693 /// FI 3694 /// ENDFOR 3695 /// \endcode 3696 /// 3697 /// \headerfile <immintrin.h> 3698 /// 3699 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 3700 /// 3701 /// \param __X 3702 /// A pointer to the memory used for storing values. 3703 /// \param __M 3704 /// A 128-bit vector of [2 x i64] containing the mask bits. 3705 /// \param __Y 3706 /// A 128-bit vector of [2 x i64] containing the values to store. 3707 static __inline__ void __DEFAULT_FN_ATTRS128 3708 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) 3709 { 3710 __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 3711 } 3712 3713 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3714 /// left by the number of bits given in the corresponding element of the 3715 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3716 /// returns the result. If the shift count for any element is greater than 3717 /// 31, the result for that element is zero. 3718 /// 3719 /// \headerfile <immintrin.h> 3720 /// 3721 /// This intrinsic corresponds to the \c VPSLLVD instruction. 3722 /// 3723 /// \param __X 3724 /// A 256-bit vector of [8 x i32] to be shifted. 3725 /// \param __Y 3726 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3727 /// bits). 
3728 /// \returns A 256-bit vector of [8 x i32] containing the result. 3729 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3730 _mm256_sllv_epi32(__m256i __X, __m256i __Y) 3731 { 3732 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); 3733 } 3734 3735 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3736 /// left by the number of bits given in the corresponding element of the 3737 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3738 /// returns the result. If the shift count for any element is greater than 3739 /// 31, the result for that element is zero. 3740 /// 3741 /// \headerfile <immintrin.h> 3742 /// 3743 /// This intrinsic corresponds to the \c VPSLLVD instruction. 3744 /// 3745 /// \param __X 3746 /// A 128-bit vector of [4 x i32] to be shifted. 3747 /// \param __Y 3748 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3749 /// bits). 3750 /// \returns A 128-bit vector of [4 x i32] containing the result. 3751 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3752 _mm_sllv_epi32(__m128i __X, __m128i __Y) 3753 { 3754 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); 3755 } 3756 3757 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3758 /// left by the number of bits given in the corresponding element of the 3759 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3760 /// returns the result. If the shift count for any element is greater than 3761 /// 63, the result for that element is zero. 3762 /// 3763 /// \headerfile <immintrin.h> 3764 /// 3765 /// This intrinsic corresponds to the \c VPSLLVQ instruction. 3766 /// 3767 /// \param __X 3768 /// A 256-bit vector of [4 x i64] to be shifted. 3769 /// \param __Y 3770 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3771 /// bits). 3772 /// \returns A 256-bit vector of [4 x i64] containing the result. 
3773 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3774 _mm256_sllv_epi64(__m256i __X, __m256i __Y) 3775 { 3776 return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); 3777 } 3778 3779 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3780 /// left by the number of bits given in the corresponding element of the 3781 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3782 /// returns the result. If the shift count for any element is greater than 3783 /// 63, the result for that element is zero. 3784 /// 3785 /// \headerfile <immintrin.h> 3786 /// 3787 /// This intrinsic corresponds to the \c VPSLLVQ instruction. 3788 /// 3789 /// \param __X 3790 /// A 128-bit vector of [2 x i64] to be shifted. 3791 /// \param __Y 3792 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3793 /// bits). 3794 /// \returns A 128-bit vector of [2 x i64] containing the result. 3795 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3796 _mm_sllv_epi64(__m128i __X, __m128i __Y) 3797 { 3798 return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); 3799 } 3800 3801 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3802 /// right by the number of bits given in the corresponding element of the 3803 /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and 3804 /// returns the result. If the shift count for any element is greater than 3805 /// 31, the result for that element is 0 or -1 according to the sign bit 3806 /// for that element. 3807 /// 3808 /// \headerfile <immintrin.h> 3809 /// 3810 /// This intrinsic corresponds to the \c VPSRAVD instruction. 3811 /// 3812 /// \param __X 3813 /// A 256-bit vector of [8 x i32] to be shifted. 3814 /// \param __Y 3815 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3816 /// bits). 3817 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3818 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3819 _mm256_srav_epi32(__m256i __X, __m256i __Y) 3820 { 3821 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); 3822 } 3823 3824 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3825 /// right by the number of bits given in the corresponding element of the 3826 /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and 3827 /// returns the result. If the shift count for any element is greater than 3828 /// 31, the result for that element is 0 or -1 according to the sign bit 3829 /// for that element. 3830 /// 3831 /// \headerfile <immintrin.h> 3832 /// 3833 /// This intrinsic corresponds to the \c VPSRAVD instruction. 3834 /// 3835 /// \param __X 3836 /// A 128-bit vector of [4 x i32] to be shifted. 3837 /// \param __Y 3838 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3839 /// bits). 3840 /// \returns A 128-bit vector of [4 x i32] containing the result. 3841 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3842 _mm_srav_epi32(__m128i __X, __m128i __Y) 3843 { 3844 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); 3845 } 3846 3847 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 3848 /// right by the number of bits given in the corresponding element of the 3849 /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 3850 /// returns the result. If the shift count for any element is greater than 3851 /// 31, the result for that element is zero. 3852 /// 3853 /// \headerfile <immintrin.h> 3854 /// 3855 /// This intrinsic corresponds to the \c VPSRLVD instruction. 3856 /// 3857 /// \param __X 3858 /// A 256-bit vector of [8 x i32] to be shifted. 3859 /// \param __Y 3860 /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 3861 /// bits). 3862 /// \returns A 256-bit vector of [8 x i32] containing the result. 
3863 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3864 _mm256_srlv_epi32(__m256i __X, __m256i __Y) 3865 { 3866 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); 3867 } 3868 3869 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 3870 /// right by the number of bits given in the corresponding element of the 3871 /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 3872 /// returns the result. If the shift count for any element is greater than 3873 /// 31, the result for that element is zero. 3874 /// 3875 /// \headerfile <immintrin.h> 3876 /// 3877 /// This intrinsic corresponds to the \c VPSRLVD instruction. 3878 /// 3879 /// \param __X 3880 /// A 128-bit vector of [4 x i32] to be shifted. 3881 /// \param __Y 3882 /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 3883 /// bits). 3884 /// \returns A 128-bit vector of [4 x i32] containing the result. 3885 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3886 _mm_srlv_epi32(__m128i __X, __m128i __Y) 3887 { 3888 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); 3889 } 3890 3891 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 3892 /// right by the number of bits given in the corresponding element of the 3893 /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 3894 /// returns the result. If the shift count for any element is greater than 3895 /// 63, the result for that element is zero. 3896 /// 3897 /// \headerfile <immintrin.h> 3898 /// 3899 /// This intrinsic corresponds to the \c VPSRLVQ instruction. 3900 /// 3901 /// \param __X 3902 /// A 256-bit vector of [4 x i64] to be shifted. 3903 /// \param __Y 3904 /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 3905 /// bits). 3906 /// \returns A 256-bit vector of [4 x i64] containing the result. 
3907 static __inline__ __m256i __DEFAULT_FN_ATTRS256 3908 _mm256_srlv_epi64(__m256i __X, __m256i __Y) 3909 { 3910 return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); 3911 } 3912 3913 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 3914 /// right by the number of bits given in the corresponding element of the 3915 /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 3916 /// returns the result. If the shift count for any element is greater than 3917 /// 63, the result for that element is zero. 3918 /// 3919 /// \headerfile <immintrin.h> 3920 /// 3921 /// This intrinsic corresponds to the \c VPSRLVQ instruction. 3922 /// 3923 /// \param __X 3924 /// A 128-bit vector of [2 x i64] to be shifted. 3925 /// \param __Y 3926 /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 3927 /// bits). 3928 /// \returns A 128-bit vector of [2 x i64] containing the result. 3929 static __inline__ __m128i __DEFAULT_FN_ATTRS128 3930 _mm_srlv_epi64(__m128i __X, __m128i __Y) 3931 { 3932 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); 3933 } 3934 3935 /// Conditionally gathers two 64-bit floating-point values, either from the 3936 /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled 3937 /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector 3938 /// of [2 x double] in \a mask determines the source for each element. 
///
/// \code{.operation}
/// FOR element := 0 TO 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 TO 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 TO 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 TO 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 TO 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from
///    vector \a a is copied to the result; otherwise the value is loaded
///    from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))

/// Conditionally gathers two 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from
///    vector \a a is copied to the result; otherwise the value is loaded
///    from memory. Only the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///    The 128-bit vector of [4 x float] in \a mask determines the source for
///    each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector is the mask bit for
///    that element. If a mask bit is zero, the corresponding value from
///    vector \a a is copied to the result; otherwise the value is loaded
///    from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///    The 128-bit vector of [4 x i32] in \a mask determines the source for
///    each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers eight 32-bit integer values, either from the
///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///    The 256-bit vector of [8 x i32] in \a mask determines the source for
///    each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8si)(__m256i)(mask), (s)))

/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///    The 128-bit vector of [4 x i32] in \a mask determines the source for
///    the lower two elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory. Only the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector is the mask bit for that
///    element. If a mask bit is zero, the corresponding value from vector
///    \a a is copied to the result; otherwise the value is loaded from
///    memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))

/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))

/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))

/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i.
///    Must be 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
                                                           _mm_setzero_pd()), \
                                      (s)))

/// Gathers four 64-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
                                                               _mm256_setzero_pd(), \
                                                               _CMP_EQ_OQ), \
                                         (s)))

/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))

/// Gathers eight 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
                                                              _mm256_setzero_ps(), \
                                                              _CMP_EQ_OQ), \
                                        (s)))

/// Gathers two 32-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
///    elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                          _mm_setzero_ps()), \
                                     (s)))

/// Gathers four 32-bit floating-point values from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The mask operand compares zero with itself, which sets every mask bit, so
   each gathered element is loaded from memory and the undefined pass-through
   operand is never selected. */
#define _mm256_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
                                                             _mm_setzero_ps()), \
                                        (s)))

/// Gathers four 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i32gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      /* merge source is ignored: the mask below is all-ones */ \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v4si)(__m128i)(i), \
      /* all-ones mask: every element is gathered */ \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))

/// Gathers eight 32-bit integer values from memory \a m using scaled indexes
///    from the 256-bit vector of [8 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_i32gather_epi32(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      /* merge source is ignored: the mask below is all-ones */ \
      (__v8si)_mm256_undefined_si256(), \
      (const int *)(m), \
      (__v8si)(__m256i)(i), \
      /* all-ones mask: every element is gathered */ \
      (__v8si)_mm256_set1_epi32(-1), \
      (s)))

/// Gathers two 32-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
///    of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes applied to
///    \a m.
/// \param s
///    The scale factor by which each index in \a i is multiplied. It must
///    be a literal constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      /* merge source is ignored: the mask below is all-ones */ \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v2di)(__m128i)(i), \
      /* all-ones mask: every element is gathered */ \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))

/// Loads four 32-bit integer values from memory, each located at \a m plus
///    a scaled index taken from the 256-bit vector of [4 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes applied to
///    \a m.
/// \param s
///    The scale factor by which each index in \a i is multiplied. It must
///    be a literal constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_i64gather_epi32(m, i, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      /* merge source is ignored: the mask below is all-ones */ \
      (__v4si)_mm_undefined_si128(), \
      (const int *)(m), \
      (__v4di)(__m256i)(i), \
      /* all-ones mask: every element is gathered */ \
      (__v4si)_mm_set1_epi32(-1), \
      (s)))

/// Loads two 64-bit integer values from memory, each located at \a m plus
///    a scaled index taken from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes applied to
///    \a m. Only elements 0 and 1 are used.
/// \param s
///    The scale factor by which each index in \a i is multiplied. It must
///    be a literal constant equal to 1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_i32gather_epi64(m, i, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      /* merge source is ignored: the mask below is all-ones */ \
      (__v2di)_mm_undefined_si128(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      /* all-ones mask: every element is gathered */ \
      (__v2di)_mm_set1_epi64x(-1), \
      (s)))

/// Gathers four 64-bit integer values from memory \a m using scaled indexes
///    from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes applied to
///    \a m.
/// \param s
///    The scale factor by which each index in \a i is multiplied. It must
///    be a literal constant equal to 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_i32gather_epi64(m, i, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      /* merge source is ignored: the mask below is all-ones */ \
      (__v4di)_mm256_undefined_si256(), \
      (const long long *)(m), \
      (__v4si)(__m128i)(i), \
      /* all-ones mask: every element is gathered */ \
      (__v4di)_mm256_set1_epi64x(-1), \
      (s)))

/// Loads two 64-bit integer values from memory, each located at \a m plus
///    a scaled index taken from the 128-bit vector of [2 x i64] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param m
///    The base address from which the values are loaded.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes applied to
///    \a m.
/// \param s
///    The scale factor by which each index in \a i is multiplied. It must
///    be a literal constant equal to 1, 2, 4, or 8.
5220 /// \returns A 128-bit vector of [2 x i64] containing the gathered values. 5221 #define _mm_i64gather_epi64(m, i, s) \ 5222 ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ 5223 (long long const *)(m), \ 5224 (__v2di)(__m128i)(i), \ 5225 (__v2di)_mm_set1_epi64x(-1), (s))) 5226 5227 /// Gathers four 64-bit integer values from memory \a m using scaled indexes 5228 /// from the 256-bit vector of [4 x i64] in \a i. 5229 /// 5230 /// \code{.operation} 5231 /// FOR element := 0 to 3 5232 /// j := element*64 5233 /// k := element*64 5234 /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 5235 /// ENDFOR 5236 /// \endcode 5237 /// 5238 /// \headerfile <immintrin.h> 5239 /// 5240 /// \code 5241 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s); 5242 /// \endcode 5243 /// 5244 /// This intrinsic corresponds to the \c VPGATHERQQ instruction. 5245 /// 5246 /// \param m 5247 /// A pointer to the memory used for loading values. 5248 /// \param i 5249 /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 5250 /// \param s 5251 /// A literal constant scale factor for the indexes in \a i. Must be 5252 /// 1, 2, 4, or 8. 5253 /// \returns A 256-bit vector of [4 x i64] containing the gathered values. 5254 #define _mm256_i64gather_epi64(m, i, s) \ 5255 ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ 5256 (long long const *)(m), \ 5257 (__v4di)(__m256i)(i), \ 5258 (__v4di)_mm256_set1_epi64x(-1), (s))) 5259 5260 #undef __DEFAULT_FN_ATTRS256 5261 #undef __DEFAULT_FN_ATTRS128 5262 5263 #endif /* __AVX2INTRIN_H */ 5264