xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx2intrin.h (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
10b57cec5SDimitry Andric /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
20b57cec5SDimitry Andric  *
30b57cec5SDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric  *
70b57cec5SDimitry Andric  *===-----------------------------------------------------------------------===
80b57cec5SDimitry Andric  */
90b57cec5SDimitry Andric 
100b57cec5SDimitry Andric #ifndef __IMMINTRIN_H
110b57cec5SDimitry Andric #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
120b57cec5SDimitry Andric #endif
130b57cec5SDimitry Andric 
140b57cec5SDimitry Andric #ifndef __AVX2INTRIN_H
150b57cec5SDimitry Andric #define __AVX2INTRIN_H
160b57cec5SDimitry Andric 
/* Attribute sets applied to the inline intrinsics in this header. The two
 * variants are identical except for the __min_vector_width__ they request
 * (256 versus 128).
 */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx2,no-evex512"),                                \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx2,no-evex512"),                                \
                 __min_vector_width__(128)))
240b57cec5SDimitry Andric 
250b57cec5SDimitry Andric /* SSE4 Multiple Packed Sums of Absolute Difference.  */
2606c3fb27SDimitry Andric /// Computes sixteen sum of absolute difference (SAD) operations on sets of
2706c3fb27SDimitry Andric ///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
2806c3fb27SDimitry Andric ///    \a Y.
2906c3fb27SDimitry Andric ///
3006c3fb27SDimitry Andric ///    Eight SAD results are computed using the lower half of the input
3106c3fb27SDimitry Andric ///    vectors, and another eight using the upper half. These 16-bit values
3206c3fb27SDimitry Andric ///    are returned in the lower and upper halves of the 256-bit result,
3306c3fb27SDimitry Andric ///    respectively.
3406c3fb27SDimitry Andric ///
3506c3fb27SDimitry Andric ///    A single SAD operation selects four bytes from \a X and four bytes from
3606c3fb27SDimitry Andric ///    \a Y as input. It computes the differences between each \a X byte and
3706c3fb27SDimitry Andric ///    the corresponding \a Y byte, takes the absolute value of each
3806c3fb27SDimitry Andric ///    difference, and sums these four values to form one 16-bit result. The
3906c3fb27SDimitry Andric ///    intrinsic computes 16 of these results with different sets of input
4006c3fb27SDimitry Andric ///    bytes.
4106c3fb27SDimitry Andric ///
4206c3fb27SDimitry Andric ///    For each set of eight results, the SAD operations use the same four
4306c3fb27SDimitry Andric ///    bytes from \a Y; the starting bit position for these four bytes is
4406c3fb27SDimitry Andric ///    specified by \a M[1:0] times 32. The eight operations use successive
4506c3fb27SDimitry Andric ///    sets of four bytes from \a X; the starting bit position for the first
4606c3fb27SDimitry Andric ///    set of four bytes is specified by \a M[2] times 32. These bit positions
4706c3fb27SDimitry Andric ///    are all relative to the 128-bit lane for each set of eight operations.
4806c3fb27SDimitry Andric ///
4906c3fb27SDimitry Andric /// \code{.operation}
5006c3fb27SDimitry Andric /// r := 0
5106c3fb27SDimitry Andric /// FOR i := 0 TO 1
5206c3fb27SDimitry Andric ///   j := i*3
5306c3fb27SDimitry Andric ///   Ybase := M[j+1:j]*32 + i*128
5406c3fb27SDimitry Andric ///   Xbase := M[j+2]*32 + i*128
5506c3fb27SDimitry Andric ///   FOR k := 0 TO 7
5606c3fb27SDimitry Andric ///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
5706c3fb27SDimitry Andric ///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
5806c3fb27SDimitry Andric ///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
5906c3fb27SDimitry Andric ///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
6006c3fb27SDimitry Andric ///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
6106c3fb27SDimitry Andric ///     Xbase := Xbase + 8
6206c3fb27SDimitry Andric ///     r := r + 16
6306c3fb27SDimitry Andric ///   ENDFOR
6406c3fb27SDimitry Andric /// ENDFOR
6506c3fb27SDimitry Andric /// \endcode
6606c3fb27SDimitry Andric ///
6706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
6806c3fb27SDimitry Andric ///
6906c3fb27SDimitry Andric /// \code
7006c3fb27SDimitry Andric /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
7106c3fb27SDimitry Andric /// \endcode
7206c3fb27SDimitry Andric ///
7306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VMPSADBW instruction.
7406c3fb27SDimitry Andric ///
7506c3fb27SDimitry Andric /// \param X
7606c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the inputs.
7706c3fb27SDimitry Andric /// \param Y
7806c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the inputs.
7906c3fb27SDimitry Andric /// \param M
8006c3fb27SDimitry Andric ///     An unsigned immediate value specifying the starting positions of the
8106c3fb27SDimitry Andric ///     bytes to operate on.
8206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
/* Kept as a macro: X and Y are viewed as __v32qi vectors and M is forwarded
 * to the builtin as an int within a single expression.
 */
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256(                                         \
      (__v32qi)(__m256i)(X),                                                   \
      (__v32qi)(__m256i)(Y),                                                   \
      (int)(M)))
860b57cec5SDimitry Andric 
8706c3fb27SDimitry Andric /// Computes the absolute value of each signed byte in the 256-bit integer
8806c3fb27SDimitry Andric ///    vector \a __a and returns each value in the corresponding byte of
8906c3fb27SDimitry Andric ///    the result.
9006c3fb27SDimitry Andric ///
9106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
9206c3fb27SDimitry Andric ///
9306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPABSB instruction.
9406c3fb27SDimitry Andric ///
9506c3fb27SDimitry Andric /// \param __a
9606c3fb27SDimitry Andric ///    A 256-bit integer vector.
9706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
980b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
990b57cec5SDimitry Andric _mm256_abs_epi8(__m256i __a)
1000b57cec5SDimitry Andric {
10104eeddc0SDimitry Andric     return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
1020b57cec5SDimitry Andric }
1030b57cec5SDimitry Andric 
10406c3fb27SDimitry Andric /// Computes the absolute value of each signed 16-bit element in the 256-bit
10506c3fb27SDimitry Andric ///    vector of [16 x i16] in \a __a and returns each value in the
10606c3fb27SDimitry Andric ///    corresponding element of the result.
10706c3fb27SDimitry Andric ///
10806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
10906c3fb27SDimitry Andric ///
11006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPABSW instruction.
11106c3fb27SDimitry Andric ///
11206c3fb27SDimitry Andric /// \param __a
11306c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
11406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
1150b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
1160b57cec5SDimitry Andric _mm256_abs_epi16(__m256i __a)
1170b57cec5SDimitry Andric {
11804eeddc0SDimitry Andric     return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
1190b57cec5SDimitry Andric }
1200b57cec5SDimitry Andric 
12106c3fb27SDimitry Andric /// Computes the absolute value of each signed 32-bit element in the 256-bit
12206c3fb27SDimitry Andric ///    vector of [8 x i32] in \a __a and returns each value in the
12306c3fb27SDimitry Andric ///    corresponding element of the result.
12406c3fb27SDimitry Andric ///
12506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
12606c3fb27SDimitry Andric ///
12706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPABSD instruction.
12806c3fb27SDimitry Andric ///
12906c3fb27SDimitry Andric /// \param __a
13006c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
13106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
1320b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
1330b57cec5SDimitry Andric _mm256_abs_epi32(__m256i __a)
1340b57cec5SDimitry Andric {
13504eeddc0SDimitry Andric     return (__m256i)__builtin_elementwise_abs((__v8si)__a);
1360b57cec5SDimitry Andric }
1370b57cec5SDimitry Andric 
13806c3fb27SDimitry Andric /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
13906c3fb27SDimitry Andric ///    integers using signed saturation, and returns the 256-bit result.
14006c3fb27SDimitry Andric ///
14106c3fb27SDimitry Andric /// \code{.operation}
14206c3fb27SDimitry Andric /// FOR i := 0 TO 7
14306c3fb27SDimitry Andric ///   j := i*16
14406c3fb27SDimitry Andric ///   k := i*8
14506c3fb27SDimitry Andric ///   result[7+k:k] := SATURATE8(__a[15+j:j])
14606c3fb27SDimitry Andric ///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
14706c3fb27SDimitry Andric ///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
14806c3fb27SDimitry Andric ///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
14906c3fb27SDimitry Andric /// ENDFOR
15006c3fb27SDimitry Andric /// \endcode
15106c3fb27SDimitry Andric ///
15206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
15306c3fb27SDimitry Andric ///
15406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKSSWB instruction.
15506c3fb27SDimitry Andric ///
15606c3fb27SDimitry Andric /// \param __a
15706c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
15806c3fb27SDimitry Andric ///    result[191:128].
15906c3fb27SDimitry Andric /// \param __b
16006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
16106c3fb27SDimitry Andric ///    result[255:192].
16206c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
1630b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
1640b57cec5SDimitry Andric _mm256_packs_epi16(__m256i __a, __m256i __b)
1650b57cec5SDimitry Andric {
1660b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
1670b57cec5SDimitry Andric }
1680b57cec5SDimitry Andric 
16906c3fb27SDimitry Andric /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
17006c3fb27SDimitry Andric ///    integers using signed saturation, and returns the resulting 256-bit
17106c3fb27SDimitry Andric ///    vector of [16 x i16].
17206c3fb27SDimitry Andric ///
17306c3fb27SDimitry Andric /// \code{.operation}
17406c3fb27SDimitry Andric /// FOR i := 0 TO 3
17506c3fb27SDimitry Andric ///   j := i*32
17606c3fb27SDimitry Andric ///   k := i*16
17706c3fb27SDimitry Andric ///   result[15+k:k] := SATURATE16(__a[31+j:j])
17806c3fb27SDimitry Andric ///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
17906c3fb27SDimitry Andric ///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
18006c3fb27SDimitry Andric ///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
18106c3fb27SDimitry Andric /// ENDFOR
18206c3fb27SDimitry Andric /// \endcode
18306c3fb27SDimitry Andric ///
18406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
18506c3fb27SDimitry Andric ///
18606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKSSDW instruction.
18706c3fb27SDimitry Andric ///
18806c3fb27SDimitry Andric /// \param __a
18906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
19006c3fb27SDimitry Andric ///    result[191:128].
19106c3fb27SDimitry Andric /// \param __b
19206c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
19306c3fb27SDimitry Andric ///    result[255:192].
19406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
1950b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
1960b57cec5SDimitry Andric _mm256_packs_epi32(__m256i __a, __m256i __b)
1970b57cec5SDimitry Andric {
1980b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
1990b57cec5SDimitry Andric }
2000b57cec5SDimitry Andric 
20106c3fb27SDimitry Andric /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
20206c3fb27SDimitry Andric ///    using unsigned saturation, and returns the 256-bit result.
20306c3fb27SDimitry Andric ///
20406c3fb27SDimitry Andric /// \code{.operation}
20506c3fb27SDimitry Andric /// FOR i := 0 TO 7
20606c3fb27SDimitry Andric ///   j := i*16
20706c3fb27SDimitry Andric ///   k := i*8
20806c3fb27SDimitry Andric ///   result[7+k:k] := SATURATE8U(__a[15+j:j])
20906c3fb27SDimitry Andric ///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
21006c3fb27SDimitry Andric ///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
21106c3fb27SDimitry Andric ///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
21206c3fb27SDimitry Andric /// ENDFOR
21306c3fb27SDimitry Andric /// \endcode
21406c3fb27SDimitry Andric ///
21506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
21606c3fb27SDimitry Andric ///
21706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKUSWB instruction.
21806c3fb27SDimitry Andric ///
21906c3fb27SDimitry Andric /// \param __a
22006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
22106c3fb27SDimitry Andric ///    result[191:128].
22206c3fb27SDimitry Andric /// \param __b
22306c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
22406c3fb27SDimitry Andric ///    result[255:192].
22506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
2260b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2270b57cec5SDimitry Andric _mm256_packus_epi16(__m256i __a, __m256i __b)
2280b57cec5SDimitry Andric {
2290b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
2300b57cec5SDimitry Andric }
2310b57cec5SDimitry Andric 
23206c3fb27SDimitry Andric /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
23306c3fb27SDimitry Andric ///    using unsigned saturation, and returns the resulting 256-bit vector of
23406c3fb27SDimitry Andric ///    [16 x i16].
23506c3fb27SDimitry Andric ///
23606c3fb27SDimitry Andric /// \code{.operation}
23706c3fb27SDimitry Andric /// FOR i := 0 TO 3
23806c3fb27SDimitry Andric ///   j := i*32
23906c3fb27SDimitry Andric ///   k := i*16
24006c3fb27SDimitry Andric ///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
24106c3fb27SDimitry Andric ///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
24206c3fb27SDimitry Andric ///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
24306c3fb27SDimitry Andric ///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
24406c3fb27SDimitry Andric /// ENDFOR
24506c3fb27SDimitry Andric /// \endcode
24606c3fb27SDimitry Andric ///
24706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
24806c3fb27SDimitry Andric ///
24906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKUSDW instruction.
25006c3fb27SDimitry Andric ///
25106c3fb27SDimitry Andric /// \param __V1
25206c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
25306c3fb27SDimitry Andric ///    result[191:128].
25406c3fb27SDimitry Andric /// \param __V2
25506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
25606c3fb27SDimitry Andric ///    result[255:192].
25706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
2580b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2590b57cec5SDimitry Andric _mm256_packus_epi32(__m256i __V1, __m256i __V2)
2600b57cec5SDimitry Andric {
2610b57cec5SDimitry Andric   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
2620b57cec5SDimitry Andric }
2630b57cec5SDimitry Andric 
26406c3fb27SDimitry Andric /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
26506c3fb27SDimitry Andric ///    vectors and returns the lower 8 bits of each sum in the corresponding
26606c3fb27SDimitry Andric ///    byte of the 256-bit integer vector result (overflow is ignored).
26706c3fb27SDimitry Andric ///
26806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
26906c3fb27SDimitry Andric ///
27006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDB instruction.
27106c3fb27SDimitry Andric ///
27206c3fb27SDimitry Andric /// \param __a
27306c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the source operands.
27406c3fb27SDimitry Andric /// \param __b
27506c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the source operands.
27606c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the sums.
2770b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2780b57cec5SDimitry Andric _mm256_add_epi8(__m256i __a, __m256i __b)
2790b57cec5SDimitry Andric {
2800b57cec5SDimitry Andric   return (__m256i)((__v32qu)__a + (__v32qu)__b);
2810b57cec5SDimitry Andric }
2820b57cec5SDimitry Andric 
28306c3fb27SDimitry Andric /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
28406c3fb27SDimitry Andric ///    [16 x i16] and returns the lower 16 bits of each sum in the
28506c3fb27SDimitry Andric ///    corresponding element of the [16 x i16] result (overflow is ignored).
28606c3fb27SDimitry Andric ///
28706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
28806c3fb27SDimitry Andric ///
28906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDW instruction.
29006c3fb27SDimitry Andric ///
29106c3fb27SDimitry Andric /// \param __a
29206c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
29306c3fb27SDimitry Andric /// \param __b
29406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
29506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums.
2960b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2970b57cec5SDimitry Andric _mm256_add_epi16(__m256i __a, __m256i __b)
2980b57cec5SDimitry Andric {
2990b57cec5SDimitry Andric   return (__m256i)((__v16hu)__a + (__v16hu)__b);
3000b57cec5SDimitry Andric }
3010b57cec5SDimitry Andric 
30206c3fb27SDimitry Andric /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
30306c3fb27SDimitry Andric ///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
30406c3fb27SDimitry Andric ///    element of the [8 x i32] result (overflow is ignored).
30506c3fb27SDimitry Andric ///
30606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
30706c3fb27SDimitry Andric ///
30806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDD instruction.
30906c3fb27SDimitry Andric ///
31006c3fb27SDimitry Andric /// \param __a
31106c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
31206c3fb27SDimitry Andric /// \param __b
31306c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
31406c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sums.
3150b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
3160b57cec5SDimitry Andric _mm256_add_epi32(__m256i __a, __m256i __b)
3170b57cec5SDimitry Andric {
3180b57cec5SDimitry Andric   return (__m256i)((__v8su)__a + (__v8su)__b);
3190b57cec5SDimitry Andric }
3200b57cec5SDimitry Andric 
32106c3fb27SDimitry Andric /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
32206c3fb27SDimitry Andric ///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
32306c3fb27SDimitry Andric ///    element of the [4 x i64] result (overflow is ignored).
32406c3fb27SDimitry Andric ///
32506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
32606c3fb27SDimitry Andric ///
32706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDQ instruction.
32806c3fb27SDimitry Andric ///
32906c3fb27SDimitry Andric /// \param __a
33006c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing one of the source operands.
33106c3fb27SDimitry Andric /// \param __b
33206c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing one of the source operands.
33306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sums.
3340b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
3350b57cec5SDimitry Andric _mm256_add_epi64(__m256i __a, __m256i __b)
3360b57cec5SDimitry Andric {
3370b57cec5SDimitry Andric   return (__m256i)((__v4du)__a + (__v4du)__b);
3380b57cec5SDimitry Andric }
3390b57cec5SDimitry Andric 
34006c3fb27SDimitry Andric /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
34106c3fb27SDimitry Andric ///    vectors using signed saturation, and returns each sum in the
34206c3fb27SDimitry Andric ///    corresponding byte of the 256-bit integer vector result.
34306c3fb27SDimitry Andric ///
34406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
34506c3fb27SDimitry Andric ///
34606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDSB instruction.
34706c3fb27SDimitry Andric ///
34806c3fb27SDimitry Andric /// \param __a
34906c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the source operands.
35006c3fb27SDimitry Andric /// \param __b
35106c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the source operands.
35206c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the sums.
3530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
3540b57cec5SDimitry Andric _mm256_adds_epi8(__m256i __a, __m256i __b)
3550b57cec5SDimitry Andric {
35681ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
3570b57cec5SDimitry Andric }
3580b57cec5SDimitry Andric 
35906c3fb27SDimitry Andric /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
36006c3fb27SDimitry Andric ///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
36106c3fb27SDimitry Andric ///
36206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
36306c3fb27SDimitry Andric ///
36406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDSW instruction.
36506c3fb27SDimitry Andric ///
36606c3fb27SDimitry Andric /// \param __a
36706c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
36806c3fb27SDimitry Andric /// \param __b
36906c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
37006c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums.
3710b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
3720b57cec5SDimitry Andric _mm256_adds_epi16(__m256i __a, __m256i __b)
3730b57cec5SDimitry Andric {
37481ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
3750b57cec5SDimitry Andric }
3760b57cec5SDimitry Andric 
37706c3fb27SDimitry Andric /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
37806c3fb27SDimitry Andric ///    vectors using unsigned saturation, and returns each sum in the
37906c3fb27SDimitry Andric ///    corresponding byte of the 256-bit integer vector result.
38006c3fb27SDimitry Andric ///
38106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
38206c3fb27SDimitry Andric ///
38306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDUSB instruction.
38406c3fb27SDimitry Andric ///
38506c3fb27SDimitry Andric /// \param __a
38606c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the source operands.
38706c3fb27SDimitry Andric /// \param __b
38806c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the source operands.
38906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the sums.
3900b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
3910b57cec5SDimitry Andric _mm256_adds_epu8(__m256i __a, __m256i __b)
3920b57cec5SDimitry Andric {
39381ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
3940b57cec5SDimitry Andric }
3950b57cec5SDimitry Andric 
39606c3fb27SDimitry Andric /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
39706c3fb27SDimitry Andric ///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
39806c3fb27SDimitry Andric ///
39906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
40006c3fb27SDimitry Andric ///
40106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDUSW instruction.
40206c3fb27SDimitry Andric ///
40306c3fb27SDimitry Andric /// \param __a
40406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
40506c3fb27SDimitry Andric /// \param __b
40606c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
40706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums.
4080b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
4090b57cec5SDimitry Andric _mm256_adds_epu16(__m256i __a, __m256i __b)
4100b57cec5SDimitry Andric {
41181ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
4120b57cec5SDimitry Andric }
4130b57cec5SDimitry Andric 
41406c3fb27SDimitry Andric /// Uses the lower half of the 256-bit vector \a a as the upper half of a
41506c3fb27SDimitry Andric ///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
41606c3fb27SDimitry Andric ///    as the lower half of the temporary value. Right-shifts the temporary
41706c3fb27SDimitry Andric ///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
41806c3fb27SDimitry Andric ///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
41906c3fb27SDimitry Andric ///    \a b to make another temporary value, right shifts by \a n, and uses
42006c3fb27SDimitry Andric ///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
42106c3fb27SDimitry Andric ///    result.
42206c3fb27SDimitry Andric ///
42306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
42406c3fb27SDimitry Andric ///
42506c3fb27SDimitry Andric /// \code
42606c3fb27SDimitry Andric /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
42706c3fb27SDimitry Andric /// \endcode
42806c3fb27SDimitry Andric ///
42906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPALIGNR instruction.
43006c3fb27SDimitry Andric ///
43106c3fb27SDimitry Andric /// \param a
43206c3fb27SDimitry Andric ///    A 256-bit integer vector containing source values.
43306c3fb27SDimitry Andric /// \param b
43406c3fb27SDimitry Andric ///    A 256-bit integer vector containing source values.
43506c3fb27SDimitry Andric /// \param n
43606c3fb27SDimitry Andric ///    An immediate value specifying the number of bytes to shift.
43706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
/* Kept as a macro: a and b are viewed as __v32qi vectors and n is forwarded
 * to the builtin unchanged within a single expression.
 */
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256(                                         \
      (__v32qi)(__m256i)(a),                                                   \
      (__v32qi)(__m256i)(b),                                                   \
      (n)))
4410b57cec5SDimitry Andric 
44206c3fb27SDimitry Andric /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
44306c3fb27SDimitry Andric ///    \a __b.
44406c3fb27SDimitry Andric ///
44506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
44606c3fb27SDimitry Andric ///
44706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPAND instruction.
44806c3fb27SDimitry Andric ///
44906c3fb27SDimitry Andric /// \param __a
45006c3fb27SDimitry Andric ///    A 256-bit integer vector.
45106c3fb27SDimitry Andric /// \param __b
45206c3fb27SDimitry Andric ///    A 256-bit integer vector.
45306c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
4540b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
4550b57cec5SDimitry Andric _mm256_and_si256(__m256i __a, __m256i __b)
4560b57cec5SDimitry Andric {
4570b57cec5SDimitry Andric   return (__m256i)((__v4du)__a & (__v4du)__b);
4580b57cec5SDimitry Andric }
4590b57cec5SDimitry Andric 
46006c3fb27SDimitry Andric /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
46106c3fb27SDimitry Andric ///    the bitwise NOT of the 256-bit integer vector in \a __a.
46206c3fb27SDimitry Andric ///
46306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
46406c3fb27SDimitry Andric ///
46506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPANDN instruction.
46606c3fb27SDimitry Andric ///
46706c3fb27SDimitry Andric /// \param __a
46806c3fb27SDimitry Andric ///    A 256-bit integer vector.
46906c3fb27SDimitry Andric /// \param __b
47006c3fb27SDimitry Andric ///    A 256-bit integer vector.
47106c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
4720b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
4730b57cec5SDimitry Andric _mm256_andnot_si256(__m256i __a, __m256i __b)
4740b57cec5SDimitry Andric {
4750b57cec5SDimitry Andric   return (__m256i)(~(__v4du)__a & (__v4du)__b);
4760b57cec5SDimitry Andric }
4770b57cec5SDimitry Andric 
47806c3fb27SDimitry Andric /// Computes the averages of the corresponding unsigned bytes in the two
47906c3fb27SDimitry Andric ///    256-bit integer vectors in \a __a and \a __b and returns each
48006c3fb27SDimitry Andric ///    average in the corresponding byte of the 256-bit result.
48106c3fb27SDimitry Andric ///
48206c3fb27SDimitry Andric /// \code{.operation}
48306c3fb27SDimitry Andric /// FOR i := 0 TO 31
48406c3fb27SDimitry Andric ///   j := i*8
48506c3fb27SDimitry Andric ///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
48606c3fb27SDimitry Andric /// ENDFOR
48706c3fb27SDimitry Andric /// \endcode
48806c3fb27SDimitry Andric ///
48906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
49006c3fb27SDimitry Andric ///
49106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPAVGB instruction.
49206c3fb27SDimitry Andric ///
49306c3fb27SDimitry Andric /// \param __a
49406c3fb27SDimitry Andric ///    A 256-bit integer vector.
49506c3fb27SDimitry Andric /// \param __b
49606c3fb27SDimitry Andric ///    A 256-bit integer vector.
49706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
4980b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
4990b57cec5SDimitry Andric _mm256_avg_epu8(__m256i __a, __m256i __b)
5000b57cec5SDimitry Andric {
5010b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
5020b57cec5SDimitry Andric }
5030b57cec5SDimitry Andric 
50406c3fb27SDimitry Andric /// Computes the rounded-up averages of the corresponding unsigned 16-bit
50506c3fb27SDimitry Andric ///    integers in the two 256-bit vectors of [16 x i16] in \a __a and \a __b
50606c3fb27SDimitry Andric ///    and returns each average in the corresponding element of the result.
50706c3fb27SDimitry Andric ///
50806c3fb27SDimitry Andric /// \code{.operation}
50906c3fb27SDimitry Andric /// FOR i := 0 TO 15
51006c3fb27SDimitry Andric ///   j := i*16
51106c3fb27SDimitry Andric ///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
51206c3fb27SDimitry Andric /// ENDFOR
51306c3fb27SDimitry Andric /// \endcode
51406c3fb27SDimitry Andric ///
51506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
51606c3fb27SDimitry Andric ///
51706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPAVGW instruction.
51806c3fb27SDimitry Andric ///
51906c3fb27SDimitry Andric /// \param __a
52006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
52106c3fb27SDimitry Andric /// \param __b
52206c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
52306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the averages.
5240b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
5250b57cec5SDimitry Andric _mm256_avg_epu16(__m256i __a, __m256i __b)
5260b57cec5SDimitry Andric {
5270b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
5280b57cec5SDimitry Andric }
5290b57cec5SDimitry Andric 
53006c3fb27SDimitry Andric /// Merges 8-bit integer values from either of the two 256-bit vectors
53106c3fb27SDimitry Andric ///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
53206c3fb27SDimitry Andric ///    the resulting 256-bit integer vector.
53306c3fb27SDimitry Andric ///
53406c3fb27SDimitry Andric /// \code{.operation}
53506c3fb27SDimitry Andric /// FOR i := 0 TO 31
53606c3fb27SDimitry Andric ///   j := i*8
53706c3fb27SDimitry Andric ///   IF __M[7+j] == 0
53806c3fb27SDimitry Andric ///     result[7+j:j] := __V1[7+j:j]
53906c3fb27SDimitry Andric ///   ELSE
54006c3fb27SDimitry Andric ///     result[7+j:j] := __V2[7+j:j]
54106c3fb27SDimitry Andric ///   FI
54206c3fb27SDimitry Andric /// ENDFOR
54306c3fb27SDimitry Andric /// \endcode
54406c3fb27SDimitry Andric ///
54506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
54606c3fb27SDimitry Andric ///
54706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBLENDVB instruction.
54806c3fb27SDimitry Andric ///
54906c3fb27SDimitry Andric /// \param __V1
55006c3fb27SDimitry Andric ///    A 256-bit integer vector containing source values.
55106c3fb27SDimitry Andric /// \param __V2
55206c3fb27SDimitry Andric ///    A 256-bit integer vector containing source values.
55306c3fb27SDimitry Andric /// \param __M
55406c3fb27SDimitry Andric ///    A 256-bit integer vector, with bit [7] of each byte specifying the
55506c3fb27SDimitry Andric ///    source for each corresponding byte of the result. When the mask bit
55606c3fb27SDimitry Andric ///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
55706c3fb27SDimitry Andric ///    \a __V2.
55806c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
5590b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
5600b57cec5SDimitry Andric _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
5610b57cec5SDimitry Andric {
5620b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
5630b57cec5SDimitry Andric                                               (__v32qi)__M);
5640b57cec5SDimitry Andric }
5650b57cec5SDimitry Andric 
56606c3fb27SDimitry Andric /// Merges 16-bit integer values from either of the two 256-bit vectors
56706c3fb27SDimitry Andric ///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
56806c3fb27SDimitry Andric ///    and returns the resulting 256-bit vector of [16 x i16].
56906c3fb27SDimitry Andric ///
57006c3fb27SDimitry Andric /// \code{.operation}
57106c3fb27SDimitry Andric /// FOR i := 0 TO 7
57206c3fb27SDimitry Andric ///   j := i*16
57306c3fb27SDimitry Andric ///   IF M[i] == 0
57406c3fb27SDimitry Andric ///     result[15+j:j] := V1[15+j:j]
57506c3fb27SDimitry Andric ///     result[143+j:128+j] := V1[143+j:128+j]
57606c3fb27SDimitry Andric ///   ELSE
57706c3fb27SDimitry Andric ///     result[15+j:j] := V2[15+j:j]
57806c3fb27SDimitry Andric ///     result[143+j:128+j] := V2[143+j:128+j]
57906c3fb27SDimitry Andric ///   FI
58006c3fb27SDimitry Andric /// ENDFOR
58106c3fb27SDimitry Andric /// \endcode
58206c3fb27SDimitry Andric ///
58306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
58406c3fb27SDimitry Andric ///
58506c3fb27SDimitry Andric /// \code
58606c3fb27SDimitry Andric /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
58706c3fb27SDimitry Andric /// \endcode
58806c3fb27SDimitry Andric ///
58906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBLENDW instruction.
59006c3fb27SDimitry Andric ///
59106c3fb27SDimitry Andric /// \param V1
59206c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing source values.
59306c3fb27SDimitry Andric /// \param V2
59406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing source values.
59506c3fb27SDimitry Andric /// \param M
59606c3fb27SDimitry Andric ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
59706c3fb27SDimitry Andric ///    source for each element of the result. The position of the mask bit
59806c3fb27SDimitry Andric ///    corresponds to the index of a copied value. When a mask bit is 0, the
59906c3fb27SDimitry Andric ///    element is copied from \a V1; otherwise, it is copied from \a V2.
60006c3fb27SDimitry Andric ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
60106c3fb27SDimitry Andric ///    elements 1 and 9, and so forth.
60206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
6030b57cec5SDimitry Andric #define _mm256_blend_epi16(V1, V2, M) \
604349cc55cSDimitry Andric   ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
605349cc55cSDimitry Andric                                       (__v16hi)(__m256i)(V2), (int)(M)))
6060b57cec5SDimitry Andric 
60706c3fb27SDimitry Andric /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
60806c3fb27SDimitry Andric ///    \a __b for equality; each byte of the 256-bit result is set to 0xFF
60906c3fb27SDimitry Andric ///    when the inputs are equal and to 0 otherwise.
61006c3fb27SDimitry Andric ///
61106c3fb27SDimitry Andric /// \code{.operation}
61206c3fb27SDimitry Andric /// FOR i := 0 TO 31
61306c3fb27SDimitry Andric ///   j := i*8
61406c3fb27SDimitry Andric ///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
61506c3fb27SDimitry Andric /// ENDFOR
61606c3fb27SDimitry Andric /// \endcode
61706c3fb27SDimitry Andric ///
61806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
61906c3fb27SDimitry Andric ///
62006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQB instruction.
62106c3fb27SDimitry Andric ///
62206c3fb27SDimitry Andric /// \param __a
62306c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the inputs.
62406c3fb27SDimitry Andric /// \param __b
62506c3fb27SDimitry Andric ///    A 256-bit integer vector containing one of the inputs.
62606c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the per-byte result masks.
6270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
6280b57cec5SDimitry Andric _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
6290b57cec5SDimitry Andric {
6300b57cec5SDimitry Andric   return (__m256i)((__v32qi)__a == (__v32qi)__b);
6310b57cec5SDimitry Andric }
6320b57cec5SDimitry Andric 
63306c3fb27SDimitry Andric /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
63406c3fb27SDimitry Andric ///    \a __a and \a __b for equality; each element of the 256-bit result is
63506c3fb27SDimitry Andric ///    set to 0xFFFF when the inputs are equal and to 0 otherwise.
63606c3fb27SDimitry Andric ///
63706c3fb27SDimitry Andric /// \code{.operation}
63806c3fb27SDimitry Andric /// FOR i := 0 TO 15
63906c3fb27SDimitry Andric ///   j := i*16
64006c3fb27SDimitry Andric ///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
64106c3fb27SDimitry Andric /// ENDFOR
64206c3fb27SDimitry Andric /// \endcode
64306c3fb27SDimitry Andric ///
64406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
64506c3fb27SDimitry Andric ///
64606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQW instruction.
64706c3fb27SDimitry Andric ///
64806c3fb27SDimitry Andric /// \param __a
64906c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the inputs.
65006c3fb27SDimitry Andric /// \param __b
65106c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the inputs.
65206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result masks.
6530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
6540b57cec5SDimitry Andric _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
6550b57cec5SDimitry Andric {
6560b57cec5SDimitry Andric   return (__m256i)((__v16hi)__a == (__v16hi)__b);
6570b57cec5SDimitry Andric }
6580b57cec5SDimitry Andric 
65906c3fb27SDimitry Andric /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
66006c3fb27SDimitry Andric ///    \a __a and \a __b for equality; each element of the 256-bit result is
66106c3fb27SDimitry Andric ///    set to 0xFFFFFFFF when the inputs are equal and to 0 otherwise.
66206c3fb27SDimitry Andric ///
66306c3fb27SDimitry Andric /// \code{.operation}
66406c3fb27SDimitry Andric /// FOR i := 0 TO 7
66506c3fb27SDimitry Andric ///   j := i*32
66606c3fb27SDimitry Andric ///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
66706c3fb27SDimitry Andric /// ENDFOR
66806c3fb27SDimitry Andric /// \endcode
66906c3fb27SDimitry Andric ///
67006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
67106c3fb27SDimitry Andric ///
67206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQD instruction.
67306c3fb27SDimitry Andric ///
67406c3fb27SDimitry Andric /// \param __a
67506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the inputs.
67606c3fb27SDimitry Andric /// \param __b
67706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the inputs.
67806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result masks.
6790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
6800b57cec5SDimitry Andric _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
6810b57cec5SDimitry Andric {
6820b57cec5SDimitry Andric   return (__m256i)((__v8si)__a == (__v8si)__b);
6830b57cec5SDimitry Andric }
6840b57cec5SDimitry Andric 
68506c3fb27SDimitry Andric /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
68606c3fb27SDimitry Andric ///    \a __a and \a __b for equality; each element of the 256-bit result is
68706c3fb27SDimitry Andric ///    set to all ones when the inputs are equal and to 0 otherwise.
68806c3fb27SDimitry Andric ///
68906c3fb27SDimitry Andric /// \code{.operation}
69006c3fb27SDimitry Andric /// FOR i := 0 TO 3
69106c3fb27SDimitry Andric ///   j := i*64
69206c3fb27SDimitry Andric ///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
69306c3fb27SDimitry Andric /// ENDFOR
69406c3fb27SDimitry Andric /// \endcode
69506c3fb27SDimitry Andric ///
69606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
69706c3fb27SDimitry Andric ///
69806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
69906c3fb27SDimitry Andric ///
70006c3fb27SDimitry Andric /// \param __a
70106c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing one of the inputs.
70206c3fb27SDimitry Andric /// \param __b
70306c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing one of the inputs.
70406c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result masks.
7050b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
7060b57cec5SDimitry Andric _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
7070b57cec5SDimitry Andric {
7080b57cec5SDimitry Andric   return (__m256i)((__v4di)__a == (__v4di)__b);
7090b57cec5SDimitry Andric }
7100b57cec5SDimitry Andric 
71106c3fb27SDimitry Andric /// Compares corresponding signed bytes in the 256-bit integer vectors in
71206c3fb27SDimitry Andric ///    \a __a and \a __b; each byte of the 256-bit result is set to 0xFF when
71306c3fb27SDimitry Andric ///    the byte from \a __a is greater than the byte from \a __b, else to 0.
71406c3fb27SDimitry Andric ///
71506c3fb27SDimitry Andric /// \code{.operation}
71606c3fb27SDimitry Andric /// FOR i := 0 TO 31
71706c3fb27SDimitry Andric ///   j := i*8
71806c3fb27SDimitry Andric ///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
71906c3fb27SDimitry Andric /// ENDFOR
72006c3fb27SDimitry Andric /// \endcode
72106c3fb27SDimitry Andric ///
72206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
72306c3fb27SDimitry Andric ///
72406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTB instruction.
72506c3fb27SDimitry Andric ///
72606c3fb27SDimitry Andric /// \param __a
72706c3fb27SDimitry Andric ///    A 256-bit integer vector containing the left-hand signed bytes.
72806c3fb27SDimitry Andric /// \param __b
72906c3fb27SDimitry Andric ///    A 256-bit integer vector containing the right-hand signed bytes.
73006c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the per-byte result masks.
7310b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
7320b57cec5SDimitry Andric _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
7330b57cec5SDimitry Andric {
7340b57cec5SDimitry Andric   /* This function always performs a signed comparison, but __v32qi is a char
7350b57cec5SDimitry Andric      which may be signed or unsigned, so use __v32qs. */
7360b57cec5SDimitry Andric   return (__m256i)((__v32qs)__a > (__v32qs)__b);
7370b57cec5SDimitry Andric }
7380b57cec5SDimitry Andric 
73906c3fb27SDimitry Andric /// Compares corresponding signed elements in the 256-bit vectors of
74006c3fb27SDimitry Andric ///    [16 x i16] in \a __a and \a __b; each element of the result is set to
74106c3fb27SDimitry Andric ///    0xFFFF when the element from \a __a is greater, and to 0 otherwise.
74206c3fb27SDimitry Andric ///
74306c3fb27SDimitry Andric /// \code{.operation}
74406c3fb27SDimitry Andric /// FOR i := 0 TO 15
74506c3fb27SDimitry Andric ///   j := i*16
74606c3fb27SDimitry Andric ///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
74706c3fb27SDimitry Andric /// ENDFOR
74806c3fb27SDimitry Andric /// \endcode
74906c3fb27SDimitry Andric ///
75006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
75106c3fb27SDimitry Andric ///
75206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTW instruction.
75306c3fb27SDimitry Andric ///
75406c3fb27SDimitry Andric /// \param __a
75506c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the left-hand inputs.
75606c3fb27SDimitry Andric /// \param __b
75706c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the right-hand inputs.
75806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result masks.
7590b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
7600b57cec5SDimitry Andric _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
7610b57cec5SDimitry Andric {
7620b57cec5SDimitry Andric   return (__m256i)((__v16hi)__a > (__v16hi)__b);
7630b57cec5SDimitry Andric }
7640b57cec5SDimitry Andric 
76506c3fb27SDimitry Andric /// Compares corresponding signed elements in the 256-bit vectors of
76606c3fb27SDimitry Andric ///    [8 x i32] in \a __a and \a __b; each element of the result is set to
76706c3fb27SDimitry Andric ///    0xFFFFFFFF when the element from \a __a is greater, and to 0 otherwise.
76806c3fb27SDimitry Andric ///
76906c3fb27SDimitry Andric /// \code{.operation}
77006c3fb27SDimitry Andric /// FOR i := 0 TO 7
77106c3fb27SDimitry Andric ///   j := i*32
77206c3fb27SDimitry Andric ///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
77306c3fb27SDimitry Andric /// ENDFOR
77406c3fb27SDimitry Andric /// \endcode
77506c3fb27SDimitry Andric ///
77606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
77706c3fb27SDimitry Andric ///
77806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTD instruction.
77906c3fb27SDimitry Andric ///
78006c3fb27SDimitry Andric /// \param __a
78106c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the left-hand inputs.
78206c3fb27SDimitry Andric /// \param __b
78306c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the right-hand inputs.
78406c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result masks.
7850b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
7860b57cec5SDimitry Andric _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
7870b57cec5SDimitry Andric {
7880b57cec5SDimitry Andric   return (__m256i)((__v8si)__a > (__v8si)__b);
7890b57cec5SDimitry Andric }
7900b57cec5SDimitry Andric 
79106c3fb27SDimitry Andric /// Compares corresponding signed elements in the 256-bit vectors of
79206c3fb27SDimitry Andric ///    [4 x i64] in \a __a and \a __b; each element of the result is set to
79306c3fb27SDimitry Andric ///    all ones when the element from \a __a is greater, and to 0 otherwise.
79406c3fb27SDimitry Andric ///
79506c3fb27SDimitry Andric /// \code{.operation}
79606c3fb27SDimitry Andric /// FOR i := 0 TO 3
79706c3fb27SDimitry Andric ///   j := i*64
79806c3fb27SDimitry Andric ///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
79906c3fb27SDimitry Andric /// ENDFOR
80006c3fb27SDimitry Andric /// \endcode
80106c3fb27SDimitry Andric ///
80206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
80306c3fb27SDimitry Andric ///
80406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
80506c3fb27SDimitry Andric ///
80606c3fb27SDimitry Andric /// \param __a
80706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the left-hand inputs.
80806c3fb27SDimitry Andric /// \param __b
80906c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the right-hand inputs.
81006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result masks.
8110b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
8120b57cec5SDimitry Andric _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
8130b57cec5SDimitry Andric {
8140b57cec5SDimitry Andric   return (__m256i)((__v4di)__a > (__v4di)__b);
8150b57cec5SDimitry Andric }
8160b57cec5SDimitry Andric 
81706c3fb27SDimitry Andric /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
81806c3fb27SDimitry Andric ///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
81906c3fb27SDimitry Andric ///    element of the [16 x i16] result (overflow is ignored). Each 128-bit
82006c3fb27SDimitry Andric ///    half is processed separately: within a half, sums from \a __a fill the
82106c3fb27SDimitry Andric ///    lower 64 bits of the result and sums from \a __b fill the upper 64
82206c3fb27SDimitry Andric ///    bits.
82306c3fb27SDimitry Andric ///
82406c3fb27SDimitry Andric /// \code{.operation}
82506c3fb27SDimitry Andric /// FOR i := 0 TO 1
82606c3fb27SDimitry Andric ///   j := i*128
82706c3fb27SDimitry Andric ///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
82806c3fb27SDimitry Andric ///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
82906c3fb27SDimitry Andric ///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
83006c3fb27SDimitry Andric ///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
83106c3fb27SDimitry Andric ///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
83206c3fb27SDimitry Andric ///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
83306c3fb27SDimitry Andric ///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
83406c3fb27SDimitry Andric ///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
83506c3fb27SDimitry Andric /// ENDFOR
83606c3fb27SDimitry Andric /// \endcode
83706c3fb27SDimitry Andric ///
83806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
83906c3fb27SDimitry Andric ///
84006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHADDW instruction.
84106c3fb27SDimitry Andric ///
84206c3fb27SDimitry Andric /// \param __a
84306c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
84406c3fb27SDimitry Andric /// \param __b
84506c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
84606c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums.
8470b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
8480b57cec5SDimitry Andric _mm256_hadd_epi16(__m256i __a, __m256i __b)
8490b57cec5SDimitry Andric {
8500b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
8510b57cec5SDimitry Andric }
8520b57cec5SDimitry Andric 
85306c3fb27SDimitry Andric /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
85406c3fb27SDimitry Andric ///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
85506c3fb27SDimitry Andric ///    element of the [8 x i32] result (overflow is ignored). Each 128-bit
85606c3fb27SDimitry Andric ///    half is processed separately: within a half, sums from \a __a fill the
85706c3fb27SDimitry Andric ///    lower 64 bits of the result and sums from \a __b fill the upper 64
85806c3fb27SDimitry Andric ///    bits.
85906c3fb27SDimitry Andric ///
86006c3fb27SDimitry Andric /// \code{.operation}
86106c3fb27SDimitry Andric /// FOR i := 0 TO 1
86206c3fb27SDimitry Andric ///   j := i*128
86306c3fb27SDimitry Andric ///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
86406c3fb27SDimitry Andric ///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
86506c3fb27SDimitry Andric ///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
86606c3fb27SDimitry Andric ///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
86706c3fb27SDimitry Andric /// ENDFOR
86806c3fb27SDimitry Andric /// \endcode
86906c3fb27SDimitry Andric ///
87006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
87106c3fb27SDimitry Andric ///
87206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHADDD instruction.
87306c3fb27SDimitry Andric ///
87406c3fb27SDimitry Andric /// \param __a
87506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
87606c3fb27SDimitry Andric /// \param __b
87706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
87806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sums.
8790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
8800b57cec5SDimitry Andric _mm256_hadd_epi32(__m256i __a, __m256i __b)
8810b57cec5SDimitry Andric {
8820b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
8830b57cec5SDimitry Andric }
8840b57cec5SDimitry Andric 
88506c3fb27SDimitry Andric /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
88606c3fb27SDimitry Andric ///    vectors of [16 x i16] using signed saturation and returns each sum in
88706c3fb27SDimitry Andric ///    an element of the [16 x i16] result. Each 128-bit half is processed
88806c3fb27SDimitry Andric ///    separately: within a half, sums from \a __a fill the lower 64 bits of
88906c3fb27SDimitry Andric ///    the result and sums from \a __b fill the upper 64 bits.
89006c3fb27SDimitry Andric ///
89106c3fb27SDimitry Andric /// \code{.operation}
89206c3fb27SDimitry Andric /// FOR i := 0 TO 1
89306c3fb27SDimitry Andric ///   j := i*128
89406c3fb27SDimitry Andric ///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
89506c3fb27SDimitry Andric ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
89606c3fb27SDimitry Andric ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
89706c3fb27SDimitry Andric ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
89806c3fb27SDimitry Andric ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
89906c3fb27SDimitry Andric ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
90006c3fb27SDimitry Andric ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
90106c3fb27SDimitry Andric ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
90206c3fb27SDimitry Andric /// ENDFOR
90306c3fb27SDimitry Andric /// \endcode
90406c3fb27SDimitry Andric ///
90506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
90606c3fb27SDimitry Andric ///
90706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHADDSW instruction.
90806c3fb27SDimitry Andric ///
90906c3fb27SDimitry Andric /// \param __a
91006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
91106c3fb27SDimitry Andric /// \param __b
91206c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
91306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the saturated sums.
9140b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
9150b57cec5SDimitry Andric _mm256_hadds_epi16(__m256i __a, __m256i __b)
9160b57cec5SDimitry Andric {
9170b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
9180b57cec5SDimitry Andric }
9190b57cec5SDimitry Andric 
92006c3fb27SDimitry Andric /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
92106c3fb27SDimitry Andric ///    vectors of [16 x i16] (even-indexed element minus the odd-indexed
92206c3fb27SDimitry Andric ///    element that follows it) and returns the lower 16 bits of each
92306c3fb27SDimitry Andric ///    difference in the [16 x i16] result (overflow is ignored). Differences
92406c3fb27SDimitry Andric ///    from \a __a fill the lower 64 bits of each 128-bit half of the result;
92506c3fb27SDimitry Andric ///    differences from \a __b fill the upper 64 bits of each 128-bit half.
92606c3fb27SDimitry Andric ///
92706c3fb27SDimitry Andric /// \code{.operation}
92806c3fb27SDimitry Andric /// FOR i := 0 TO 1
92906c3fb27SDimitry Andric ///   j := i*128
93006c3fb27SDimitry Andric ///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
93106c3fb27SDimitry Andric ///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
93206c3fb27SDimitry Andric ///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
93306c3fb27SDimitry Andric ///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
93406c3fb27SDimitry Andric ///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
93506c3fb27SDimitry Andric ///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
93606c3fb27SDimitry Andric ///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
93706c3fb27SDimitry Andric ///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
93806c3fb27SDimitry Andric /// ENDFOR
93906c3fb27SDimitry Andric /// \endcode
94006c3fb27SDimitry Andric ///
94106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
94206c3fb27SDimitry Andric ///
94306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHSUBW instruction.
94406c3fb27SDimitry Andric ///
94506c3fb27SDimitry Andric /// \param __a
94606c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
94706c3fb27SDimitry Andric /// \param __b
94806c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
94906c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences.
9500b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
9510b57cec5SDimitry Andric _mm256_hsub_epi16(__m256i __a, __m256i __b)
9520b57cec5SDimitry Andric {
9530b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
9540b57cec5SDimitry Andric }
9550b57cec5SDimitry Andric 
95606c3fb27SDimitry Andric /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
95706c3fb27SDimitry Andric ///    vectors of [8 x i32] (even-indexed element minus the odd-indexed
95806c3fb27SDimitry Andric ///    element that follows it) and returns the lower 32 bits of each
95906c3fb27SDimitry Andric ///    difference in the [8 x i32] result (overflow is ignored). Differences
96006c3fb27SDimitry Andric ///    from \a __a fill the lower 64 bits of each 128-bit half of the result;
96106c3fb27SDimitry Andric ///    differences from \a __b fill the upper 64 bits of each 128-bit half.
96206c3fb27SDimitry Andric ///
96306c3fb27SDimitry Andric /// \code{.operation}
96406c3fb27SDimitry Andric /// FOR i := 0 TO 1
96506c3fb27SDimitry Andric ///   j := i*128
96606c3fb27SDimitry Andric ///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
96706c3fb27SDimitry Andric ///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
96806c3fb27SDimitry Andric ///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
96906c3fb27SDimitry Andric ///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
97006c3fb27SDimitry Andric /// ENDFOR
97106c3fb27SDimitry Andric /// \endcode
97206c3fb27SDimitry Andric ///
97306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
97406c3fb27SDimitry Andric ///
97506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHSUBD instruction.
97606c3fb27SDimitry Andric ///
97706c3fb27SDimitry Andric /// \param __a
97806c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
97906c3fb27SDimitry Andric /// \param __b
98006c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
98106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the differences.
9820b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
9830b57cec5SDimitry Andric _mm256_hsub_epi32(__m256i __a, __m256i __b)
9840b57cec5SDimitry Andric {
9850b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
9860b57cec5SDimitry Andric }
9870b57cec5SDimitry Andric 
98806c3fb27SDimitry Andric /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
98906c3fb27SDimitry Andric ///    vectors of [16 x i16] using signed saturation and returns each
99006c3fb27SDimitry Andric ///    difference in an element of the [16 x i16] result. Differences from
99106c3fb27SDimitry Andric ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
99206c3fb27SDimitry Andric ///    result; differences from \a __b are returned in the upper 64 bits of
99306c3fb27SDimitry Andric ///    each 128-bit half of the result.
99406c3fb27SDimitry Andric ///
99506c3fb27SDimitry Andric /// \code{.operation}
99606c3fb27SDimitry Andric /// FOR i := 0 TO 1
99706c3fb27SDimitry Andric ///   j := i*128
99806c3fb27SDimitry Andric ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
99906c3fb27SDimitry Andric ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
100006c3fb27SDimitry Andric ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
100106c3fb27SDimitry Andric ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
100206c3fb27SDimitry Andric ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
100306c3fb27SDimitry Andric ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
100406c3fb27SDimitry Andric ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
100506c3fb27SDimitry Andric ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
100606c3fb27SDimitry Andric /// ENDFOR
100706c3fb27SDimitry Andric /// \endcode
100806c3fb27SDimitry Andric ///
100906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
101006c3fb27SDimitry Andric ///
101106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHSUBSW instruction.
101206c3fb27SDimitry Andric ///
101306c3fb27SDimitry Andric /// \param __a
101406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
101506c3fb27SDimitry Andric /// \param __b
101606c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
101706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences.
10180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
10190b57cec5SDimitry Andric _mm256_hsubs_epi16(__m256i __a, __m256i __b)
10200b57cec5SDimitry Andric {
10210b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
10220b57cec5SDimitry Andric }
10230b57cec5SDimitry Andric 
102406c3fb27SDimitry Andric /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
102506c3fb27SDimitry Andric ///    with the corresponding signed byte from the 256-bit integer vector in
102606c3fb27SDimitry Andric ///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
102706c3fb27SDimitry Andric ///    pairs of those products using signed saturation to form 16-bit sums
102806c3fb27SDimitry Andric ///    returned as elements of the [16 x i16] result.
102906c3fb27SDimitry Andric ///
103006c3fb27SDimitry Andric /// \code{.operation}
103106c3fb27SDimitry Andric /// FOR i := 0 TO 15
103206c3fb27SDimitry Andric ///   j := i*16
103306c3fb27SDimitry Andric ///   temp1 := __a[j+7:j] * __b[j+7:j]
103406c3fb27SDimitry Andric ///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
103506c3fb27SDimitry Andric ///   result[j+15:j] := SATURATE16(temp1 + temp2)
103606c3fb27SDimitry Andric /// ENDFOR
103706c3fb27SDimitry Andric /// \endcode
103806c3fb27SDimitry Andric ///
103906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
104006c3fb27SDimitry Andric ///
104106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
104206c3fb27SDimitry Andric ///
104306c3fb27SDimitry Andric /// \param __a
104406c3fb27SDimitry Andric ///    A 256-bit vector containing one of the source operands.
104506c3fb27SDimitry Andric /// \param __b
104606c3fb27SDimitry Andric ///    A 256-bit vector containing one of the source operands.
104706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
10480b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
10490b57cec5SDimitry Andric _mm256_maddubs_epi16(__m256i __a, __m256i __b)
10500b57cec5SDimitry Andric {
10510b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
10520b57cec5SDimitry Andric }
10530b57cec5SDimitry Andric 
105406c3fb27SDimitry Andric /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
105506c3fb27SDimitry Andric ///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
105606c3fb27SDimitry Andric ///    those products to form 32-bit sums returned as elements of the
105706c3fb27SDimitry Andric ///    [8 x i32] result.
105806c3fb27SDimitry Andric ///
105906c3fb27SDimitry Andric ///    There is only one wraparound case: when all four of the 16-bit sources
106006c3fb27SDimitry Andric ///    are \c 0x8000, the result will be \c 0x80000000.
106106c3fb27SDimitry Andric ///
106206c3fb27SDimitry Andric /// \code{.operation}
106306c3fb27SDimitry Andric /// FOR i := 0 TO 7
106406c3fb27SDimitry Andric ///   j := i*32
106506c3fb27SDimitry Andric ///   temp1 := __a[j+15:j] * __b[j+15:j]
106606c3fb27SDimitry Andric ///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
106706c3fb27SDimitry Andric ///   result[j+31:j] := temp1 + temp2
106806c3fb27SDimitry Andric /// ENDFOR
106906c3fb27SDimitry Andric /// \endcode
107006c3fb27SDimitry Andric ///
107106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
107206c3fb27SDimitry Andric ///
107306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMADDWD instruction.
107406c3fb27SDimitry Andric ///
107506c3fb27SDimitry Andric /// \param __a
107606c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
107706c3fb27SDimitry Andric /// \param __b
107806c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
107906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
10800b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
10810b57cec5SDimitry Andric _mm256_madd_epi16(__m256i __a, __m256i __b)
10820b57cec5SDimitry Andric {
10830b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
10840b57cec5SDimitry Andric }
10850b57cec5SDimitry Andric 
108606c3fb27SDimitry Andric /// Compares the corresponding signed bytes in the two 256-bit integer vectors
108706c3fb27SDimitry Andric ///     in \a __a and \a __b and returns the larger of each pair in the
108806c3fb27SDimitry Andric ///     corresponding byte of the 256-bit result.
108906c3fb27SDimitry Andric ///
109006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
109106c3fb27SDimitry Andric ///
109206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXSB instruction.
109306c3fb27SDimitry Andric ///
109406c3fb27SDimitry Andric /// \param __a
109506c3fb27SDimitry Andric ///    A 256-bit integer vector.
109606c3fb27SDimitry Andric /// \param __b
109706c3fb27SDimitry Andric ///    A 256-bit integer vector.
109806c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
10990b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
11000b57cec5SDimitry Andric _mm256_max_epi8(__m256i __a, __m256i __b)
11010b57cec5SDimitry Andric {
110204eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
11030b57cec5SDimitry Andric }
11040b57cec5SDimitry Andric 
110506c3fb27SDimitry Andric /// Compares the corresponding signed 16-bit integers in the two 256-bit
110606c3fb27SDimitry Andric ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
110706c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
110806c3fb27SDimitry Andric ///
110906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
111006c3fb27SDimitry Andric ///
111106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXSW instruction.
111206c3fb27SDimitry Andric ///
111306c3fb27SDimitry Andric /// \param __a
111406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
111506c3fb27SDimitry Andric /// \param __b
111606c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
111706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
11180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
11190b57cec5SDimitry Andric _mm256_max_epi16(__m256i __a, __m256i __b)
11200b57cec5SDimitry Andric {
112104eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
11220b57cec5SDimitry Andric }
11230b57cec5SDimitry Andric 
112406c3fb27SDimitry Andric /// Compares the corresponding signed 32-bit integers in the two 256-bit
112506c3fb27SDimitry Andric ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
112606c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
112706c3fb27SDimitry Andric ///
112806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
112906c3fb27SDimitry Andric ///
113006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXSD instruction.
113106c3fb27SDimitry Andric ///
113206c3fb27SDimitry Andric /// \param __a
113306c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
113406c3fb27SDimitry Andric /// \param __b
113506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
113606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
11370b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
11380b57cec5SDimitry Andric _mm256_max_epi32(__m256i __a, __m256i __b)
11390b57cec5SDimitry Andric {
114004eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
11410b57cec5SDimitry Andric }
11420b57cec5SDimitry Andric 
114306c3fb27SDimitry Andric /// Compares the corresponding unsigned bytes in the two 256-bit integer
114406c3fb27SDimitry Andric ///     vectors in \a __a and \a __b and returns the larger of each pair in
114506c3fb27SDimitry Andric ///     the corresponding byte of the 256-bit result.
114606c3fb27SDimitry Andric ///
114706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
114806c3fb27SDimitry Andric ///
114906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXUB instruction.
115006c3fb27SDimitry Andric ///
115106c3fb27SDimitry Andric /// \param __a
115206c3fb27SDimitry Andric ///    A 256-bit integer vector.
115306c3fb27SDimitry Andric /// \param __b
115406c3fb27SDimitry Andric ///    A 256-bit integer vector.
115506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
11560b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
11570b57cec5SDimitry Andric _mm256_max_epu8(__m256i __a, __m256i __b)
11580b57cec5SDimitry Andric {
115904eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
11600b57cec5SDimitry Andric }
11610b57cec5SDimitry Andric 
116206c3fb27SDimitry Andric /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
116306c3fb27SDimitry Andric ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
116406c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
116506c3fb27SDimitry Andric ///
116606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
116706c3fb27SDimitry Andric ///
116806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXUW instruction.
116906c3fb27SDimitry Andric ///
117006c3fb27SDimitry Andric /// \param __a
117106c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
117206c3fb27SDimitry Andric /// \param __b
117306c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
117406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
11750b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
11760b57cec5SDimitry Andric _mm256_max_epu16(__m256i __a, __m256i __b)
11770b57cec5SDimitry Andric {
117804eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
11790b57cec5SDimitry Andric }
11800b57cec5SDimitry Andric 
118106c3fb27SDimitry Andric /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
118206c3fb27SDimitry Andric ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
118306c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
118406c3fb27SDimitry Andric ///
118506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
118606c3fb27SDimitry Andric ///
118706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXUD instruction.
118806c3fb27SDimitry Andric ///
118906c3fb27SDimitry Andric /// \param __a
119006c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
119106c3fb27SDimitry Andric /// \param __b
119206c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
119306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
11940b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
11950b57cec5SDimitry Andric _mm256_max_epu32(__m256i __a, __m256i __b)
11960b57cec5SDimitry Andric {
119704eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
11980b57cec5SDimitry Andric }
11990b57cec5SDimitry Andric 
120006c3fb27SDimitry Andric /// Compares the corresponding signed bytes in the two 256-bit integer vectors
120106c3fb27SDimitry Andric ///     in \a __a and \a __b and returns the smaller of each pair in the
120206c3fb27SDimitry Andric ///     corresponding byte of the 256-bit result.
120306c3fb27SDimitry Andric ///
120406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
120506c3fb27SDimitry Andric ///
120606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINSB instruction.
120706c3fb27SDimitry Andric ///
120806c3fb27SDimitry Andric /// \param __a
120906c3fb27SDimitry Andric ///    A 256-bit integer vector.
121006c3fb27SDimitry Andric /// \param __b
121106c3fb27SDimitry Andric ///    A 256-bit integer vector.
121206c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
12130b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
12140b57cec5SDimitry Andric _mm256_min_epi8(__m256i __a, __m256i __b)
12150b57cec5SDimitry Andric {
121604eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
12170b57cec5SDimitry Andric }
12180b57cec5SDimitry Andric 
121906c3fb27SDimitry Andric /// Compares the corresponding signed 16-bit integers in the two 256-bit
122006c3fb27SDimitry Andric ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
122106c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
122206c3fb27SDimitry Andric ///
122306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
122406c3fb27SDimitry Andric ///
122506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINSW instruction.
122606c3fb27SDimitry Andric ///
122706c3fb27SDimitry Andric /// \param __a
122806c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
122906c3fb27SDimitry Andric /// \param __b
123006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
123106c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
12320b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
12330b57cec5SDimitry Andric _mm256_min_epi16(__m256i __a, __m256i __b)
12340b57cec5SDimitry Andric {
123504eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
12360b57cec5SDimitry Andric }
12370b57cec5SDimitry Andric 
123806c3fb27SDimitry Andric /// Compares the corresponding signed 32-bit integers in the two 256-bit
123906c3fb27SDimitry Andric ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
124006c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
124106c3fb27SDimitry Andric ///
124206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
124306c3fb27SDimitry Andric ///
124406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINSD instruction.
124506c3fb27SDimitry Andric ///
124606c3fb27SDimitry Andric /// \param __a
124706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
124806c3fb27SDimitry Andric /// \param __b
124906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
125006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
12510b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
12520b57cec5SDimitry Andric _mm256_min_epi32(__m256i __a, __m256i __b)
12530b57cec5SDimitry Andric {
125404eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
12550b57cec5SDimitry Andric }
12560b57cec5SDimitry Andric 
125706c3fb27SDimitry Andric /// Compares the corresponding unsigned bytes in the two 256-bit integer
125806c3fb27SDimitry Andric ///     vectors in \a __a and \a __b and returns the smaller of each pair in
125906c3fb27SDimitry Andric ///     the corresponding byte of the 256-bit result.
126006c3fb27SDimitry Andric ///
126106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
126206c3fb27SDimitry Andric ///
126306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINUB instruction.
126406c3fb27SDimitry Andric ///
126506c3fb27SDimitry Andric /// \param __a
126606c3fb27SDimitry Andric ///    A 256-bit integer vector.
126706c3fb27SDimitry Andric /// \param __b
126806c3fb27SDimitry Andric ///    A 256-bit integer vector.
126906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
12700b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
12710b57cec5SDimitry Andric _mm256_min_epu8(__m256i __a, __m256i __b)
12720b57cec5SDimitry Andric {
127304eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
12740b57cec5SDimitry Andric }
12750b57cec5SDimitry Andric 
127606c3fb27SDimitry Andric /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
127706c3fb27SDimitry Andric ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
127806c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
127906c3fb27SDimitry Andric ///
128006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
128106c3fb27SDimitry Andric ///
128206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINUW instruction.
128306c3fb27SDimitry Andric ///
128406c3fb27SDimitry Andric /// \param __a
128506c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
128606c3fb27SDimitry Andric /// \param __b
128706c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
128806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
12890b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
12900b57cec5SDimitry Andric _mm256_min_epu16(__m256i __a, __m256i __b)
12910b57cec5SDimitry Andric {
129204eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
12930b57cec5SDimitry Andric }
12940b57cec5SDimitry Andric 
129506c3fb27SDimitry Andric /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
129606c3fb27SDimitry Andric ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
129706c3fb27SDimitry Andric ///    each pair in the corresponding element of the 256-bit result.
129806c3fb27SDimitry Andric ///
129906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
130006c3fb27SDimitry Andric ///
130106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINUD instruction.
130206c3fb27SDimitry Andric ///
130306c3fb27SDimitry Andric /// \param __a
130406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
130506c3fb27SDimitry Andric /// \param __b
130606c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
130706c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
13080b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
13090b57cec5SDimitry Andric _mm256_min_epu32(__m256i __a, __m256i __b)
13100b57cec5SDimitry Andric {
131104eeddc0SDimitry Andric   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
13120b57cec5SDimitry Andric }
13130b57cec5SDimitry Andric 
1314*5f757f3fSDimitry Andric /// Creates a 32-bit integer mask from the most significant bit of each byte
1315*5f757f3fSDimitry Andric ///    in the 256-bit integer vector in \a __a and returns the result.
1316*5f757f3fSDimitry Andric ///
1317*5f757f3fSDimitry Andric /// \code{.operation}
1318*5f757f3fSDimitry Andric /// FOR i := 0 TO 31
1319*5f757f3fSDimitry Andric ///   j := i*8
1320*5f757f3fSDimitry Andric ///   result[i] := __a[j+7]
1321*5f757f3fSDimitry Andric /// ENDFOR
1322*5f757f3fSDimitry Andric /// \endcode
1323*5f757f3fSDimitry Andric ///
1324*5f757f3fSDimitry Andric /// \headerfile <immintrin.h>
1325*5f757f3fSDimitry Andric ///
1326*5f757f3fSDimitry Andric /// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1327*5f757f3fSDimitry Andric ///
1328*5f757f3fSDimitry Andric /// \param __a
1329*5f757f3fSDimitry Andric ///    A 256-bit integer vector containing the source bytes.
1330*5f757f3fSDimitry Andric /// \returns The 32-bit integer mask.
13310b57cec5SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS256
13320b57cec5SDimitry Andric _mm256_movemask_epi8(__m256i __a)
13330b57cec5SDimitry Andric {
13340b57cec5SDimitry Andric   return __builtin_ia32_pmovmskb256((__v32qi)__a);
13350b57cec5SDimitry Andric }
13360b57cec5SDimitry Andric 
133706c3fb27SDimitry Andric /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
133806c3fb27SDimitry Andric ///    the 16-bit values in the corresponding elements of a 256-bit vector
133906c3fb27SDimitry Andric ///    of [16 x i16].
134006c3fb27SDimitry Andric ///
134106c3fb27SDimitry Andric /// \code{.operation}
134206c3fb27SDimitry Andric /// FOR i := 0 TO 15
134306c3fb27SDimitry Andric ///   j := i*8
134406c3fb27SDimitry Andric ///   k := i*16
134506c3fb27SDimitry Andric ///   result[k+15:k] := SignExtend(__V[j+7:j])
134606c3fb27SDimitry Andric /// ENDFOR
134706c3fb27SDimitry Andric /// \endcode
134806c3fb27SDimitry Andric ///
134906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
135006c3fb27SDimitry Andric ///
135106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
135206c3fb27SDimitry Andric ///
135306c3fb27SDimitry Andric /// \param __V
135406c3fb27SDimitry Andric ///    A 128-bit integer vector containing the source bytes.
135506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
135606c3fb27SDimitry Andric ///    values.
13570b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
13580b57cec5SDimitry Andric _mm256_cvtepi8_epi16(__m128i __V)
13590b57cec5SDimitry Andric {
13600b57cec5SDimitry Andric   /* This function always performs a signed extension, but __v16qi is a char
13610b57cec5SDimitry Andric      which may be signed or unsigned, so use __v16qs. */
13620b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
13630b57cec5SDimitry Andric }
13640b57cec5SDimitry Andric 
136506c3fb27SDimitry Andric /// Sign-extends bytes from the lower half of the 128-bit integer vector in
136606c3fb27SDimitry Andric ///    \a __V and returns the 32-bit values in the corresponding elements of a
136706c3fb27SDimitry Andric ///    256-bit vector of [8 x i32].
136806c3fb27SDimitry Andric ///
136906c3fb27SDimitry Andric /// \code{.operation}
137006c3fb27SDimitry Andric /// FOR i := 0 TO 7
137106c3fb27SDimitry Andric ///   j := i*8
137206c3fb27SDimitry Andric ///   k := i*32
137306c3fb27SDimitry Andric ///   result[k+31:k] := SignExtend(__V[j+7:j])
137406c3fb27SDimitry Andric /// ENDFOR
137506c3fb27SDimitry Andric /// \endcode
137606c3fb27SDimitry Andric ///
137706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
137806c3fb27SDimitry Andric ///
137906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
138006c3fb27SDimitry Andric ///
138106c3fb27SDimitry Andric /// \param __V
138206c3fb27SDimitry Andric ///    A 128-bit integer vector containing the source bytes.
138306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
138406c3fb27SDimitry Andric ///    values.
13850b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
13860b57cec5SDimitry Andric _mm256_cvtepi8_epi32(__m128i __V)
13870b57cec5SDimitry Andric {
13880b57cec5SDimitry Andric   /* This function always performs a signed extension, but __v16qi is a char
13890b57cec5SDimitry Andric      which may be signed or unsigned, so use __v16qs. */
13900b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
13910b57cec5SDimitry Andric }
13920b57cec5SDimitry Andric 
139306c3fb27SDimitry Andric /// Sign-extends the first four bytes from the 128-bit integer vector in
139406c3fb27SDimitry Andric ///    \a __V and returns the 64-bit values in the corresponding elements of a
139506c3fb27SDimitry Andric ///    256-bit vector of [4 x i64].
139606c3fb27SDimitry Andric ///
139706c3fb27SDimitry Andric /// \code{.operation}
139806c3fb27SDimitry Andric /// result[63:0] := SignExtend(__V[7:0])
139906c3fb27SDimitry Andric /// result[127:64] := SignExtend(__V[15:8])
140006c3fb27SDimitry Andric /// result[191:128] := SignExtend(__V[23:16])
140106c3fb27SDimitry Andric /// result[255:192] := SignExtend(__V[31:24])
140206c3fb27SDimitry Andric /// \endcode
140306c3fb27SDimitry Andric ///
140406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
140506c3fb27SDimitry Andric ///
140606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
140706c3fb27SDimitry Andric ///
140806c3fb27SDimitry Andric /// \param __V
140906c3fb27SDimitry Andric ///    A 128-bit integer vector containing the source bytes.
141006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
141106c3fb27SDimitry Andric ///    values.
14120b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
14130b57cec5SDimitry Andric _mm256_cvtepi8_epi64(__m128i __V)
14140b57cec5SDimitry Andric {
14150b57cec5SDimitry Andric   /* This function always performs a signed extension, but __v16qi is a char
14160b57cec5SDimitry Andric      which may be signed or unsigned, so use __v16qs. */
14170b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
14180b57cec5SDimitry Andric }
14190b57cec5SDimitry Andric 
142006c3fb27SDimitry Andric /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
142106c3fb27SDimitry Andric ///    \a __V and returns the 32-bit values in the corresponding elements of a
142206c3fb27SDimitry Andric ///    256-bit vector of [8 x i32].
142306c3fb27SDimitry Andric ///
142406c3fb27SDimitry Andric /// \code{.operation}
142506c3fb27SDimitry Andric /// FOR i := 0 TO 7
142606c3fb27SDimitry Andric ///   j := i*16
142706c3fb27SDimitry Andric ///   k := i*32
142806c3fb27SDimitry Andric ///   result[k+31:k] := SignExtend(__V[j+15:j])
142906c3fb27SDimitry Andric /// ENDFOR
143006c3fb27SDimitry Andric /// \endcode
143106c3fb27SDimitry Andric ///
143206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
143306c3fb27SDimitry Andric ///
143406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
143506c3fb27SDimitry Andric ///
143606c3fb27SDimitry Andric /// \param __V
143706c3fb27SDimitry Andric ///    A 128-bit vector of [8 x i16] containing the source values.
143806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
143906c3fb27SDimitry Andric ///    values.
14400b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
14410b57cec5SDimitry Andric _mm256_cvtepi16_epi32(__m128i __V)
14420b57cec5SDimitry Andric {
14430b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
14440b57cec5SDimitry Andric }
14450b57cec5SDimitry Andric 
144606c3fb27SDimitry Andric /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
144706c3fb27SDimitry Andric ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
144806c3fb27SDimitry Andric ///    elements of a 256-bit vector of [4 x i64].
144906c3fb27SDimitry Andric ///
145006c3fb27SDimitry Andric /// \code{.operation}
145106c3fb27SDimitry Andric /// result[63:0] := SignExtend(__V[15:0])
145206c3fb27SDimitry Andric /// result[127:64] := SignExtend(__V[31:16])
145306c3fb27SDimitry Andric /// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
145506c3fb27SDimitry Andric /// \endcode
145606c3fb27SDimitry Andric ///
145706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
145806c3fb27SDimitry Andric ///
145906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
146006c3fb27SDimitry Andric ///
146106c3fb27SDimitry Andric /// \param __V
146206c3fb27SDimitry Andric ///    A 128-bit vector of [8 x i16] containing the source values.
146306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
146406c3fb27SDimitry Andric ///    values.
14650b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
14660b57cec5SDimitry Andric _mm256_cvtepi16_epi64(__m128i __V)
14670b57cec5SDimitry Andric {
14680b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
14690b57cec5SDimitry Andric }
14700b57cec5SDimitry Andric 
147106c3fb27SDimitry Andric /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
147206c3fb27SDimitry Andric ///    \a __V and returns the 64-bit values in the corresponding elements of a
147306c3fb27SDimitry Andric ///    256-bit vector of [4 x i64].
147406c3fb27SDimitry Andric ///
147506c3fb27SDimitry Andric /// \code{.operation}
147606c3fb27SDimitry Andric /// result[63:0] := SignExtend(__V[31:0])
147706c3fb27SDimitry Andric /// result[127:64] := SignExtend(__V[63:32])
147806c3fb27SDimitry Andric /// result[191:128] := SignExtend(__V[95:64])
147906c3fb27SDimitry Andric /// result[255:192] := SignExtend(__V[127:96])
148006c3fb27SDimitry Andric /// \endcode
148106c3fb27SDimitry Andric ///
148206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
148306c3fb27SDimitry Andric ///
148406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
148506c3fb27SDimitry Andric ///
148606c3fb27SDimitry Andric /// \param __V
148706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the source values.
148806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
148906c3fb27SDimitry Andric ///    values.
14900b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
14910b57cec5SDimitry Andric _mm256_cvtepi32_epi64(__m128i __V)
14920b57cec5SDimitry Andric {
14930b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
14940b57cec5SDimitry Andric }
14950b57cec5SDimitry Andric 
149606c3fb27SDimitry Andric /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
149706c3fb27SDimitry Andric ///    the 16-bit values in the corresponding elements of a 256-bit vector
149806c3fb27SDimitry Andric ///    of [16 x i16].
149906c3fb27SDimitry Andric ///
150006c3fb27SDimitry Andric /// \code{.operation}
150106c3fb27SDimitry Andric /// FOR i := 0 TO 15
150206c3fb27SDimitry Andric ///   j := i*8
150306c3fb27SDimitry Andric ///   k := i*16
150406c3fb27SDimitry Andric ///   result[k+15:k] := ZeroExtend(__V[j+7:j])
150506c3fb27SDimitry Andric /// ENDFOR
150606c3fb27SDimitry Andric /// \endcode
150706c3fb27SDimitry Andric ///
150806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
150906c3fb27SDimitry Andric ///
151006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
151106c3fb27SDimitry Andric ///
151206c3fb27SDimitry Andric /// \param __V
151306c3fb27SDimitry Andric ///    A 128-bit integer vector containing the source bytes.
151406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
151506c3fb27SDimitry Andric ///    values.
15160b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
15170b57cec5SDimitry Andric _mm256_cvtepu8_epi16(__m128i __V)
15180b57cec5SDimitry Andric {
15190b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
15200b57cec5SDimitry Andric }
15210b57cec5SDimitry Andric 
152206c3fb27SDimitry Andric /// Zero-extends bytes from the lower half of the 128-bit integer vector in
152306c3fb27SDimitry Andric ///    \a __V and returns the 32-bit values in the corresponding elements of a
152406c3fb27SDimitry Andric ///    256-bit vector of [8 x i32].
152506c3fb27SDimitry Andric ///
152606c3fb27SDimitry Andric /// \code{.operation}
152706c3fb27SDimitry Andric /// FOR i := 0 TO 7
152806c3fb27SDimitry Andric ///   j := i*8
152906c3fb27SDimitry Andric ///   k := i*32
153006c3fb27SDimitry Andric ///   result[k+31:k] := ZeroExtend(__V[j+7:j])
153106c3fb27SDimitry Andric /// ENDFOR
153206c3fb27SDimitry Andric /// \endcode
153306c3fb27SDimitry Andric ///
153406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
153506c3fb27SDimitry Andric ///
153606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
153706c3fb27SDimitry Andric ///
153806c3fb27SDimitry Andric /// \param __V
153906c3fb27SDimitry Andric ///    A 128-bit integer vector containing the source bytes.
154006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
154106c3fb27SDimitry Andric ///    values.
15420b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
15430b57cec5SDimitry Andric _mm256_cvtepu8_epi32(__m128i __V)
15440b57cec5SDimitry Andric {
15450b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
15460b57cec5SDimitry Andric }
15470b57cec5SDimitry Andric 
154806c3fb27SDimitry Andric /// Zero-extends the first four bytes from the 128-bit integer vector in
154906c3fb27SDimitry Andric ///    \a __V and returns the 64-bit values in the corresponding elements of a
155006c3fb27SDimitry Andric ///    256-bit vector of [4 x i64].
155106c3fb27SDimitry Andric ///
155206c3fb27SDimitry Andric /// \code{.operation}
155306c3fb27SDimitry Andric /// result[63:0] := ZeroExtend(__V[7:0])
155406c3fb27SDimitry Andric /// result[127:64] := ZeroExtend(__V[15:8])
155506c3fb27SDimitry Andric /// result[191:128] := ZeroExtend(__V[23:16])
155606c3fb27SDimitry Andric /// result[255:192] := ZeroExtend(__V[31:24])
155706c3fb27SDimitry Andric /// \endcode
155806c3fb27SDimitry Andric ///
155906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
156006c3fb27SDimitry Andric ///
156106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
156206c3fb27SDimitry Andric ///
156306c3fb27SDimitry Andric /// \param __V
156406c3fb27SDimitry Andric ///    A 128-bit integer vector containing the source bytes.
156506c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
156606c3fb27SDimitry Andric ///    values.
15670b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
15680b57cec5SDimitry Andric _mm256_cvtepu8_epi64(__m128i __V)
15690b57cec5SDimitry Andric {
15700b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
15710b57cec5SDimitry Andric }
15720b57cec5SDimitry Andric 
157306c3fb27SDimitry Andric /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
157406c3fb27SDimitry Andric ///    \a __V and returns the 32-bit values in the corresponding elements of a
157506c3fb27SDimitry Andric ///    256-bit vector of [8 x i32].
157606c3fb27SDimitry Andric ///
157706c3fb27SDimitry Andric /// \code{.operation}
157806c3fb27SDimitry Andric /// FOR i := 0 TO 7
157906c3fb27SDimitry Andric ///   j := i*16
158006c3fb27SDimitry Andric ///   k := i*32
158106c3fb27SDimitry Andric ///   result[k+31:k] := ZeroExtend(__V[j+15:j])
158206c3fb27SDimitry Andric /// ENDFOR
158306c3fb27SDimitry Andric /// \endcode
158406c3fb27SDimitry Andric ///
158506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
158606c3fb27SDimitry Andric ///
158706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
158806c3fb27SDimitry Andric ///
158906c3fb27SDimitry Andric /// \param __V
159006c3fb27SDimitry Andric ///    A 128-bit vector of [8 x i16] containing the source values.
159106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
159206c3fb27SDimitry Andric ///    values.
15930b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
15940b57cec5SDimitry Andric _mm256_cvtepu16_epi32(__m128i __V)
15950b57cec5SDimitry Andric {
15960b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
15970b57cec5SDimitry Andric }
15980b57cec5SDimitry Andric 
159906c3fb27SDimitry Andric /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
160006c3fb27SDimitry Andric ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
160106c3fb27SDimitry Andric ///    elements of a 256-bit vector of [4 x i64].
160206c3fb27SDimitry Andric ///
160306c3fb27SDimitry Andric /// \code{.operation}
160406c3fb27SDimitry Andric /// result[63:0] := ZeroExtend(__V[15:0])
160506c3fb27SDimitry Andric /// result[127:64] := ZeroExtend(__V[31:16])
160606c3fb27SDimitry Andric /// result[191:128] := ZeroExtend(__V[47:32])
160706c3fb27SDimitry Andric /// result[255:192] := ZeroExtend(__V[64:48])
160806c3fb27SDimitry Andric /// \endcode
160906c3fb27SDimitry Andric ///
161006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
161106c3fb27SDimitry Andric ///
161206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
161306c3fb27SDimitry Andric ///
161406c3fb27SDimitry Andric /// \param __V
161506c3fb27SDimitry Andric ///    A 128-bit vector of [8 x i16] containing the source values.
161606c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
161706c3fb27SDimitry Andric ///    values.
16180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
16190b57cec5SDimitry Andric _mm256_cvtepu16_epi64(__m128i __V)
16200b57cec5SDimitry Andric {
16210b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
16220b57cec5SDimitry Andric }
16230b57cec5SDimitry Andric 
162406c3fb27SDimitry Andric /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
162506c3fb27SDimitry Andric ///    \a __V and returns the 64-bit values in the corresponding elements of a
162606c3fb27SDimitry Andric ///    256-bit vector of [4 x i64].
162706c3fb27SDimitry Andric ///
162806c3fb27SDimitry Andric /// \code{.operation}
162906c3fb27SDimitry Andric /// result[63:0] := ZeroExtend(__V[31:0])
163006c3fb27SDimitry Andric /// result[127:64] := ZeroExtend(__V[63:32])
163106c3fb27SDimitry Andric /// result[191:128] := ZeroExtend(__V[95:64])
163206c3fb27SDimitry Andric /// result[255:192] := ZeroExtend(__V[127:96])
163306c3fb27SDimitry Andric /// \endcode
163406c3fb27SDimitry Andric ///
163506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
163606c3fb27SDimitry Andric ///
163706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
163806c3fb27SDimitry Andric ///
163906c3fb27SDimitry Andric /// \param __V
164006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the source values.
164106c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
164206c3fb27SDimitry Andric ///    values.
16430b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
16440b57cec5SDimitry Andric _mm256_cvtepu32_epi64(__m128i __V)
16450b57cec5SDimitry Andric {
16460b57cec5SDimitry Andric   return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
16470b57cec5SDimitry Andric }
16480b57cec5SDimitry Andric 
164906c3fb27SDimitry Andric /// Multiplies signed 32-bit integers from even-numbered elements of two
165006c3fb27SDimitry Andric ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
165106c3fb27SDimitry Andric ///    [4 x i64] result.
165206c3fb27SDimitry Andric ///
165306c3fb27SDimitry Andric /// \code{.operation}
165406c3fb27SDimitry Andric /// result[63:0] := __a[31:0] * __b[31:0]
165506c3fb27SDimitry Andric /// result[127:64] := __a[95:64] * __b[95:64]
165606c3fb27SDimitry Andric /// result[191:128] := __a[159:128] * __b[159:128]
165706c3fb27SDimitry Andric /// result[255:192] := __a[223:192] * __b[223:192]
165806c3fb27SDimitry Andric /// \endcode
165906c3fb27SDimitry Andric ///
166006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
166106c3fb27SDimitry Andric ///
166206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULDQ instruction.
166306c3fb27SDimitry Andric ///
166406c3fb27SDimitry Andric /// \param __a
166506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
166606c3fb27SDimitry Andric /// \param __b
166706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
166806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the products.
16690b57cec5SDimitry Andric static __inline__  __m256i __DEFAULT_FN_ATTRS256
16700b57cec5SDimitry Andric _mm256_mul_epi32(__m256i __a, __m256i __b)
16710b57cec5SDimitry Andric {
16720b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
16730b57cec5SDimitry Andric }
16740b57cec5SDimitry Andric 
167506c3fb27SDimitry Andric /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
167606c3fb27SDimitry Andric ///    [16 x i16], truncates the 32-bit results to the most significant 18
167706c3fb27SDimitry Andric ///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
167806c3fb27SDimitry Andric ///    product in the [16 x i16] result.
167906c3fb27SDimitry Andric ///
168006c3fb27SDimitry Andric /// \code{.operation}
168106c3fb27SDimitry Andric /// FOR i := 0 TO 15
168206c3fb27SDimitry Andric ///   j := i*16
168306c3fb27SDimitry Andric ///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
168406c3fb27SDimitry Andric ///   result[j+15:j] := temp[16:1]
168506c3fb27SDimitry Andric /// \endcode
168606c3fb27SDimitry Andric ///
168706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
168806c3fb27SDimitry Andric ///
168906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULHRSW instruction.
169006c3fb27SDimitry Andric ///
169106c3fb27SDimitry Andric /// \param __a
169206c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
169306c3fb27SDimitry Andric /// \param __b
169406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
169506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
16960b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
16970b57cec5SDimitry Andric _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
16980b57cec5SDimitry Andric {
16990b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
17000b57cec5SDimitry Andric }
17010b57cec5SDimitry Andric 
170206c3fb27SDimitry Andric /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
170306c3fb27SDimitry Andric ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
170406c3fb27SDimitry Andric ///    [16 x i16] result.
170506c3fb27SDimitry Andric ///
170606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
170706c3fb27SDimitry Andric ///
170806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULHUW instruction.
170906c3fb27SDimitry Andric ///
171006c3fb27SDimitry Andric /// \param __a
171106c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
171206c3fb27SDimitry Andric /// \param __b
171306c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
171406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the products.
17150b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
17160b57cec5SDimitry Andric _mm256_mulhi_epu16(__m256i __a, __m256i __b)
17170b57cec5SDimitry Andric {
17180b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
17190b57cec5SDimitry Andric }
17200b57cec5SDimitry Andric 
172106c3fb27SDimitry Andric /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
172206c3fb27SDimitry Andric ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
172306c3fb27SDimitry Andric ///    [16 x i16] result.
172406c3fb27SDimitry Andric ///
172506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
172606c3fb27SDimitry Andric ///
172706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULHW instruction.
172806c3fb27SDimitry Andric ///
172906c3fb27SDimitry Andric /// \param __a
173006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
173106c3fb27SDimitry Andric /// \param __b
173206c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
173306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the products.
17340b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
17350b57cec5SDimitry Andric _mm256_mulhi_epi16(__m256i __a, __m256i __b)
17360b57cec5SDimitry Andric {
17370b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
17380b57cec5SDimitry Andric }
17390b57cec5SDimitry Andric 
174006c3fb27SDimitry Andric /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
174106c3fb27SDimitry Andric ///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
174206c3fb27SDimitry Andric ///    [16 x i16] result.
174306c3fb27SDimitry Andric ///
174406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
174506c3fb27SDimitry Andric ///
174606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULLW instruction.
174706c3fb27SDimitry Andric ///
174806c3fb27SDimitry Andric /// \param __a
174906c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
175006c3fb27SDimitry Andric /// \param __b
175106c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing one of the source operands.
175206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the products.
17530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
17540b57cec5SDimitry Andric _mm256_mullo_epi16(__m256i __a, __m256i __b)
17550b57cec5SDimitry Andric {
17560b57cec5SDimitry Andric   return (__m256i)((__v16hu)__a * (__v16hu)__b);
17570b57cec5SDimitry Andric }
17580b57cec5SDimitry Andric 
175906c3fb27SDimitry Andric /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
176006c3fb27SDimitry Andric ///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
176106c3fb27SDimitry Andric ///    [8 x i32] result.
176206c3fb27SDimitry Andric ///
176306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
176406c3fb27SDimitry Andric ///
176506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULLD instruction.
176606c3fb27SDimitry Andric ///
176706c3fb27SDimitry Andric /// \param __a
176806c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
176906c3fb27SDimitry Andric /// \param __b
177006c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
177106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the products.
17720b57cec5SDimitry Andric static __inline__  __m256i __DEFAULT_FN_ATTRS256
17730b57cec5SDimitry Andric _mm256_mullo_epi32 (__m256i __a, __m256i __b)
17740b57cec5SDimitry Andric {
17750b57cec5SDimitry Andric   return (__m256i)((__v8su)__a * (__v8su)__b);
17760b57cec5SDimitry Andric }
17770b57cec5SDimitry Andric 
177806c3fb27SDimitry Andric /// Multiplies unsigned 32-bit integers from even-numered elements of two
177906c3fb27SDimitry Andric ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
178006c3fb27SDimitry Andric ///    [4 x i64] result.
178106c3fb27SDimitry Andric ///
178206c3fb27SDimitry Andric /// \code{.operation}
178306c3fb27SDimitry Andric /// result[63:0] := __a[31:0] * __b[31:0]
178406c3fb27SDimitry Andric /// result[127:64] := __a[95:64] * __b[95:64]
178506c3fb27SDimitry Andric /// result[191:128] := __a[159:128] * __b[159:128]
178606c3fb27SDimitry Andric /// result[255:192] := __a[223:192] * __b[223:192]
178706c3fb27SDimitry Andric /// \endcode
178806c3fb27SDimitry Andric ///
178906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
179006c3fb27SDimitry Andric ///
179106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULUDQ instruction.
179206c3fb27SDimitry Andric ///
179306c3fb27SDimitry Andric /// \param __a
179406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
179506c3fb27SDimitry Andric /// \param __b
179606c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing one of the source operands.
179706c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the products.
17980b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
17990b57cec5SDimitry Andric _mm256_mul_epu32(__m256i __a, __m256i __b)
18000b57cec5SDimitry Andric {
18010b57cec5SDimitry Andric   return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
18020b57cec5SDimitry Andric }
18030b57cec5SDimitry Andric 
180406c3fb27SDimitry Andric /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
180506c3fb27SDimitry Andric ///    \a __b.
180606c3fb27SDimitry Andric ///
180706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
180806c3fb27SDimitry Andric ///
180906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPOR instruction.
181006c3fb27SDimitry Andric ///
181106c3fb27SDimitry Andric /// \param __a
181206c3fb27SDimitry Andric ///    A 256-bit integer vector.
181306c3fb27SDimitry Andric /// \param __b
181406c3fb27SDimitry Andric ///    A 256-bit integer vector.
181506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
18160b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
18170b57cec5SDimitry Andric _mm256_or_si256(__m256i __a, __m256i __b)
18180b57cec5SDimitry Andric {
18190b57cec5SDimitry Andric   return (__m256i)((__v4du)__a | (__v4du)__b);
18200b57cec5SDimitry Andric }
18210b57cec5SDimitry Andric 
182206c3fb27SDimitry Andric /// Computes four sum of absolute difference (SAD) operations on sets of eight
182306c3fb27SDimitry Andric ///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
182406c3fb27SDimitry Andric ///    \a __b.
182506c3fb27SDimitry Andric ///
182606c3fb27SDimitry Andric ///    One SAD result is computed for each set of eight bytes from \a __a and
182706c3fb27SDimitry Andric ///    eight bytes from \a __b. The zero-extended SAD value is returned in the
182806c3fb27SDimitry Andric ///    corresponding 64-bit element of the result.
182906c3fb27SDimitry Andric ///
183006c3fb27SDimitry Andric ///    A single SAD operation takes the differences between the corresponding
183106c3fb27SDimitry Andric ///    bytes of \a __a and \a __b, takes the absolute value of each difference,
183206c3fb27SDimitry Andric ///    and sums these eight values to form one 16-bit result. This operation
183306c3fb27SDimitry Andric ///    is repeated four times with successive sets of eight bytes.
183406c3fb27SDimitry Andric ///
183506c3fb27SDimitry Andric /// \code{.operation}
183606c3fb27SDimitry Andric /// FOR i := 0 TO 3
183706c3fb27SDimitry Andric ///   j := i*64
183806c3fb27SDimitry Andric ///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
183906c3fb27SDimitry Andric ///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
184006c3fb27SDimitry Andric ///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
184106c3fb27SDimitry Andric ///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
184206c3fb27SDimitry Andric ///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
184306c3fb27SDimitry Andric ///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
184406c3fb27SDimitry Andric ///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
184506c3fb27SDimitry Andric ///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
184606c3fb27SDimitry Andric ///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
184706c3fb27SDimitry Andric ///                     temp4 + temp5 + temp6 + temp7
184806c3fb27SDimitry Andric ///   result[j+63:j+16] := 0
184906c3fb27SDimitry Andric /// ENDFOR
185006c3fb27SDimitry Andric /// \endcode
185106c3fb27SDimitry Andric ///
185206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
185306c3fb27SDimitry Andric ///
185406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSADBW instruction.
185506c3fb27SDimitry Andric ///
185606c3fb27SDimitry Andric /// \param __a
185706c3fb27SDimitry Andric ///    A 256-bit integer vector.
185806c3fb27SDimitry Andric /// \param __b
185906c3fb27SDimitry Andric ///    A 256-bit integer vector.
186006c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
18610b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
18620b57cec5SDimitry Andric _mm256_sad_epu8(__m256i __a, __m256i __b)
18630b57cec5SDimitry Andric {
18640b57cec5SDimitry Andric   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
18650b57cec5SDimitry Andric }
18660b57cec5SDimitry Andric 
186706c3fb27SDimitry Andric /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
186806c3fb27SDimitry Andric ///    to control information in the 256-bit integer vector \a __b, and
186906c3fb27SDimitry Andric ///    returns the 256-bit result. In effect there are two separate 128-bit
187006c3fb27SDimitry Andric ///    shuffles in the lower and upper halves.
187106c3fb27SDimitry Andric ///
187206c3fb27SDimitry Andric /// \code{.operation}
187306c3fb27SDimitry Andric /// FOR i := 0 TO 31
187406c3fb27SDimitry Andric ///   j := i*8
187506c3fb27SDimitry Andric ///   IF __b[j+7] == 1
187606c3fb27SDimitry Andric ///     result[j+7:j] := 0
187706c3fb27SDimitry Andric ///   ELSE
187806c3fb27SDimitry Andric ///     k := __b[j+3:j] * 8
187906c3fb27SDimitry Andric ///     IF i > 15
188006c3fb27SDimitry Andric ///       k := k + 128
188106c3fb27SDimitry Andric ///     FI
188206c3fb27SDimitry Andric ///     result[j+7:j] := __a[k+7:k]
188306c3fb27SDimitry Andric ///   FI
188406c3fb27SDimitry Andric /// ENDFOR
188506c3fb27SDimitry Andric /// \endcode
188606c3fb27SDimitry Andric ///
188706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
188806c3fb27SDimitry Andric ///
188906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSHUFB instruction.
189006c3fb27SDimitry Andric ///
189106c3fb27SDimitry Andric /// \param __a
189206c3fb27SDimitry Andric ///    A 256-bit integer vector containing source values.
189306c3fb27SDimitry Andric /// \param __b
189406c3fb27SDimitry Andric ///    A 256-bit integer vector containing control information to determine
189506c3fb27SDimitry Andric ///    what goes into the corresponding byte of the result. If bit 7 of the
189606c3fb27SDimitry Andric ///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
189706c3fb27SDimitry Andric ///    control byte specify the index (within the same 128-bit half) of \a __a
189806c3fb27SDimitry Andric ///    to copy to the result byte.
189906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
19000b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
19010b57cec5SDimitry Andric _mm256_shuffle_epi8(__m256i __a, __m256i __b)
19020b57cec5SDimitry Andric {
19030b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
19040b57cec5SDimitry Andric }
19050b57cec5SDimitry Andric 
/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. In effect there are two parallel 128-bit
///    shuffles in the lower and upper halves.
///
/// \code{.operation}
/// FOR i := 0 to 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
19380b57cec5SDimitry Andric 
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth)
///    within each 128-bit half.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
19740b57cec5SDimitry Andric 
/// Rearranges 16-bit integers of the 256-bit vector of [16 x i16] in \a a
///    under the control of the integer literal \a imm and returns the
///    256-bit [16 x i16] result. Only the lower 64 bits of each 128-bit half
///    are shuffled; the upper 64 bits of each 128-bit half are copied from
///    \a a unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] supplying the source values.
/// \param imm
///    An immediate 8-bit value selecting the elements to copy from \a a.
///    \a imm[1:0] gives the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] gives the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] holding the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
20110b57cec5SDimitry Andric 
201206c3fb27SDimitry Andric /// Sets each byte of the result to the corresponding byte of the 256-bit
201306c3fb27SDimitry Andric ///    integer vector in \a __a, the negative of that byte, or zero, depending
201406c3fb27SDimitry Andric ///    on whether the corresponding byte of the 256-bit integer vector in
201506c3fb27SDimitry Andric ///    \a __b is greater than zero, less than zero, or equal to zero,
201606c3fb27SDimitry Andric ///    respectively.
201706c3fb27SDimitry Andric ///
201806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
201906c3fb27SDimitry Andric ///
202006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSIGNB instruction.
202106c3fb27SDimitry Andric ///
202206c3fb27SDimitry Andric /// \param __a
202306c3fb27SDimitry Andric ///    A 256-bit integer vector.
202406c3fb27SDimitry Andric /// \param __b
202506c3fb27SDimitry Andric ///    A 256-bit integer vector].
202606c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
20270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
20280b57cec5SDimitry Andric _mm256_sign_epi8(__m256i __a, __m256i __b)
20290b57cec5SDimitry Andric {
20300b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
20310b57cec5SDimitry Andric }
20320b57cec5SDimitry Andric 
203306c3fb27SDimitry Andric /// Sets each element of the result to the corresponding element of the
203406c3fb27SDimitry Andric ///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
203506c3fb27SDimitry Andric ///    or zero, depending on whether the corresponding element of the 256-bit
203606c3fb27SDimitry Andric ///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
203706c3fb27SDimitry Andric ///    equal to zero, respectively.
203806c3fb27SDimitry Andric ///
203906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
204006c3fb27SDimitry Andric ///
204106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSIGNW instruction.
204206c3fb27SDimitry Andric ///
204306c3fb27SDimitry Andric /// \param __a
204406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
204506c3fb27SDimitry Andric /// \param __b
204606c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16].
204706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
20480b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
20490b57cec5SDimitry Andric _mm256_sign_epi16(__m256i __a, __m256i __b)
20500b57cec5SDimitry Andric {
20510b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
20520b57cec5SDimitry Andric }
20530b57cec5SDimitry Andric 
205406c3fb27SDimitry Andric /// Sets each element of the result to the corresponding element of the
205506c3fb27SDimitry Andric ///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
205606c3fb27SDimitry Andric ///    zero, depending on whether the corresponding element of the 256-bit
205706c3fb27SDimitry Andric ///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
205806c3fb27SDimitry Andric ///    equal to zero, respectively.
205906c3fb27SDimitry Andric ///
206006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
206106c3fb27SDimitry Andric ///
206206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSIGND instruction.
206306c3fb27SDimitry Andric ///
206406c3fb27SDimitry Andric /// \param __a
206506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
206606c3fb27SDimitry Andric /// \param __b
206706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32].
206806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
20690b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
20700b57cec5SDimitry Andric _mm256_sign_epi32(__m256i __a, __m256i __b)
20710b57cec5SDimitry Andric {
20720b57cec5SDimitry Andric     return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
20730b57cec5SDimitry Andric }
20740b57cec5SDimitry Andric 
/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
///    is greater than 15, the returned result is all zeroes.
///
///    This macro has the same expansion as \c _mm256_bslli_epi128.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_slli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), \
                                                (int)(imm)))

/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
///    is greater than 15, the returned result is all zeroes.
///
///    This macro has the same expansion as \c _mm256_slli_si256.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), \
                                                (int)(imm)))

211506c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
211606c3fb27SDimitry Andric ///    left by \a __count bits, shifting in zero bits, and returns the result.
211706c3fb27SDimitry Andric ///    If \a __count is greater than 15, the returned result is all zeroes.
211806c3fb27SDimitry Andric ///
211906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
212006c3fb27SDimitry Andric ///
212106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLW instruction.
212206c3fb27SDimitry Andric ///
212306c3fb27SDimitry Andric /// \param __a
212406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] to be shifted.
212506c3fb27SDimitry Andric /// \param __count
212606c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
212706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
21280b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
21290b57cec5SDimitry Andric _mm256_slli_epi16(__m256i __a, int __count)
21300b57cec5SDimitry Andric {
21310b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
21320b57cec5SDimitry Andric }
21330b57cec5SDimitry Andric 
213406c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
213506c3fb27SDimitry Andric ///    left by the number of bits specified by the lower 64 bits of \a __count,
213606c3fb27SDimitry Andric ///    shifting in zero bits, and returns the result. If \a __count is greater
213706c3fb27SDimitry Andric ///    than 15, the returned result is all zeroes.
213806c3fb27SDimitry Andric ///
213906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
214006c3fb27SDimitry Andric ///
214106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLW instruction.
214206c3fb27SDimitry Andric ///
214306c3fb27SDimitry Andric /// \param __a
214406c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] to be shifted.
214506c3fb27SDimitry Andric /// \param __count
214606c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
214706c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
214806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
21490b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
21500b57cec5SDimitry Andric _mm256_sll_epi16(__m256i __a, __m128i __count)
21510b57cec5SDimitry Andric {
21520b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
21530b57cec5SDimitry Andric }
21540b57cec5SDimitry Andric 
215506c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
215606c3fb27SDimitry Andric ///    left by \a __count bits, shifting in zero bits, and returns the result.
215706c3fb27SDimitry Andric ///    If \a __count is greater than 31, the returned result is all zeroes.
215806c3fb27SDimitry Andric ///
215906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
216006c3fb27SDimitry Andric ///
216106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLD instruction.
216206c3fb27SDimitry Andric ///
216306c3fb27SDimitry Andric /// \param __a
216406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
216506c3fb27SDimitry Andric /// \param __count
216606c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
216706c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
21680b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
21690b57cec5SDimitry Andric _mm256_slli_epi32(__m256i __a, int __count)
21700b57cec5SDimitry Andric {
21710b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
21720b57cec5SDimitry Andric }
21730b57cec5SDimitry Andric 
217406c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
217506c3fb27SDimitry Andric ///    left by the number of bits given in the lower 64 bits of \a __count,
217606c3fb27SDimitry Andric ///    shifting in zero bits, and returns the result. If \a __count is greater
217706c3fb27SDimitry Andric ///    than 31, the returned result is all zeroes.
217806c3fb27SDimitry Andric ///
217906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
218006c3fb27SDimitry Andric ///
218106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLD instruction.
218206c3fb27SDimitry Andric ///
218306c3fb27SDimitry Andric /// \param __a
218406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
218506c3fb27SDimitry Andric /// \param __count
218606c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
218706c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
218806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
21890b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
21900b57cec5SDimitry Andric _mm256_sll_epi32(__m256i __a, __m128i __count)
21910b57cec5SDimitry Andric {
21920b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
21930b57cec5SDimitry Andric }
21940b57cec5SDimitry Andric 
219506c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
219606c3fb27SDimitry Andric ///    left by \a __count bits, shifting in zero bits, and returns the result.
219706c3fb27SDimitry Andric ///    If \a __count is greater than 63, the returned result is all zeroes.
219806c3fb27SDimitry Andric ///
219906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
220006c3fb27SDimitry Andric ///
220106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLQ instruction.
220206c3fb27SDimitry Andric ///
220306c3fb27SDimitry Andric /// \param __a
220406c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] to be shifted.
220506c3fb27SDimitry Andric /// \param __count
220606c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
220706c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
22080b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
22090b57cec5SDimitry Andric _mm256_slli_epi64(__m256i __a, int __count)
22100b57cec5SDimitry Andric {
22110b57cec5SDimitry Andric   return __builtin_ia32_psllqi256((__v4di)__a, __count);
22120b57cec5SDimitry Andric }
22130b57cec5SDimitry Andric 
221406c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
221506c3fb27SDimitry Andric ///    left by the number of bits given in the lower 64 bits of \a __count,
221606c3fb27SDimitry Andric ///    shifting in zero bits, and returns the result. If \a __count is greater
221706c3fb27SDimitry Andric ///    than 63, the returned result is all zeroes.
221806c3fb27SDimitry Andric ///
221906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
222006c3fb27SDimitry Andric ///
222106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLQ instruction.
222206c3fb27SDimitry Andric ///
222306c3fb27SDimitry Andric /// \param __a
222406c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] to be shifted.
222506c3fb27SDimitry Andric /// \param __count
222606c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
222706c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
222806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
22290b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
22300b57cec5SDimitry Andric _mm256_sll_epi64(__m256i __a, __m128i __count)
22310b57cec5SDimitry Andric {
22320b57cec5SDimitry Andric   return __builtin_ia32_psllq256((__v4di)__a, __count);
22330b57cec5SDimitry Andric }
22340b57cec5SDimitry Andric 
223506c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
223606c3fb27SDimitry Andric ///    right by \a __count bits, shifting in sign bits, and returns the result.
223706c3fb27SDimitry Andric ///    If \a __count is greater than 15, each element of the result is either
223806c3fb27SDimitry Andric ///    0 or -1 according to the corresponding input sign bit.
223906c3fb27SDimitry Andric ///
224006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
224106c3fb27SDimitry Andric ///
224206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAW instruction.
224306c3fb27SDimitry Andric ///
224406c3fb27SDimitry Andric /// \param __a
224506c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] to be shifted.
224606c3fb27SDimitry Andric /// \param __count
224706c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
224806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
22490b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
22500b57cec5SDimitry Andric _mm256_srai_epi16(__m256i __a, int __count)
22510b57cec5SDimitry Andric {
22520b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
22530b57cec5SDimitry Andric }
22540b57cec5SDimitry Andric 
225506c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
225606c3fb27SDimitry Andric ///    right by the number of bits given in the lower 64 bits of \a __count,
225706c3fb27SDimitry Andric ///    shifting in sign bits, and returns the result. If \a __count is greater
225806c3fb27SDimitry Andric ///    than 15, each element of the result is either 0 or -1 according to the
225906c3fb27SDimitry Andric ///    corresponding input sign bit.
226006c3fb27SDimitry Andric ///
226106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
226206c3fb27SDimitry Andric ///
226306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAW instruction.
226406c3fb27SDimitry Andric ///
226506c3fb27SDimitry Andric /// \param __a
226606c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] to be shifted.
226706c3fb27SDimitry Andric /// \param __count
226806c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
226906c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
227006c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
22710b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
22720b57cec5SDimitry Andric _mm256_sra_epi16(__m256i __a, __m128i __count)
22730b57cec5SDimitry Andric {
22740b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
22750b57cec5SDimitry Andric }
22760b57cec5SDimitry Andric 
227706c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
227806c3fb27SDimitry Andric ///    right by \a __count bits, shifting in sign bits, and returns the result.
227906c3fb27SDimitry Andric ///    If \a __count is greater than 31, each element of the result is either
228006c3fb27SDimitry Andric ///    0 or -1 according to the corresponding input sign bit.
228106c3fb27SDimitry Andric ///
228206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
228306c3fb27SDimitry Andric ///
228406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAD instruction.
228506c3fb27SDimitry Andric ///
228606c3fb27SDimitry Andric /// \param __a
228706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
228806c3fb27SDimitry Andric /// \param __count
228906c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
229006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
22910b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
22920b57cec5SDimitry Andric _mm256_srai_epi32(__m256i __a, int __count)
22930b57cec5SDimitry Andric {
22940b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
22950b57cec5SDimitry Andric }
22960b57cec5SDimitry Andric 
229706c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
229806c3fb27SDimitry Andric ///    right by the number of bits given in the lower 64 bits of \a __count,
229906c3fb27SDimitry Andric ///    shifting in sign bits, and returns the result. If \a __count is greater
230006c3fb27SDimitry Andric ///    than 31, each element of the result is either 0 or -1 according to the
230106c3fb27SDimitry Andric ///    corresponding input sign bit.
230206c3fb27SDimitry Andric ///
230306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
230406c3fb27SDimitry Andric ///
230506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAD instruction.
230606c3fb27SDimitry Andric ///
230706c3fb27SDimitry Andric /// \param __a
230806c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
230906c3fb27SDimitry Andric /// \param __count
231006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
231106c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
231206c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
23130b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
23140b57cec5SDimitry Andric _mm256_sra_epi32(__m256i __a, __m128i __count)
23150b57cec5SDimitry Andric {
23160b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
23170b57cec5SDimitry Andric }
23180b57cec5SDimitry Andric 
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
///    This macro has the same expansion as \c _mm256_bsrli_epi128.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_srli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), \
                                                (int)(imm)))

/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
///    This macro has the same expansion as \c _mm256_srli_si256.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm) \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), \
                                                (int)(imm)))

235906c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
236006c3fb27SDimitry Andric ///    right by \a __count bits, shifting in zero bits, and returns the result.
236106c3fb27SDimitry Andric ///    If \a __count is greater than 15, the returned result is all zeroes.
236206c3fb27SDimitry Andric ///
236306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
236406c3fb27SDimitry Andric ///
236506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLW instruction.
236606c3fb27SDimitry Andric ///
236706c3fb27SDimitry Andric /// \param __a
236806c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] to be shifted.
236906c3fb27SDimitry Andric /// \param __count
237006c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
237106c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
23720b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
23730b57cec5SDimitry Andric _mm256_srli_epi16(__m256i __a, int __count)
23740b57cec5SDimitry Andric {
23750b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
23760b57cec5SDimitry Andric }
23770b57cec5SDimitry Andric 
237806c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
237906c3fb27SDimitry Andric ///    right by the number of bits given in the lower 64 bits of \a __count,
238006c3fb27SDimitry Andric ///    shifting in zero bits, and returns the result. If \a __count is greater
238106c3fb27SDimitry Andric ///    than 15, the returned result is all zeroes.
238206c3fb27SDimitry Andric ///
238306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
238406c3fb27SDimitry Andric ///
238506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLW instruction.
238606c3fb27SDimitry Andric ///
238706c3fb27SDimitry Andric /// \param __a
238806c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] to be shifted.
238906c3fb27SDimitry Andric /// \param __count
239006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
239106c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
239206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
23930b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
23940b57cec5SDimitry Andric _mm256_srl_epi16(__m256i __a, __m128i __count)
23950b57cec5SDimitry Andric {
23960b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
23970b57cec5SDimitry Andric }
23980b57cec5SDimitry Andric 
239906c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
240006c3fb27SDimitry Andric ///    right by \a __count bits, shifting in zero bits, and returns the result.
240106c3fb27SDimitry Andric ///    If \a __count is greater than 31, the returned result is all zeroes.
240206c3fb27SDimitry Andric ///
240306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
240406c3fb27SDimitry Andric ///
240506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLD instruction.
240606c3fb27SDimitry Andric ///
240706c3fb27SDimitry Andric /// \param __a
240806c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
240906c3fb27SDimitry Andric /// \param __count
241006c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
241106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
24120b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
24130b57cec5SDimitry Andric _mm256_srli_epi32(__m256i __a, int __count)
24140b57cec5SDimitry Andric {
24150b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
24160b57cec5SDimitry Andric }
24170b57cec5SDimitry Andric 
241806c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
241906c3fb27SDimitry Andric ///    right by the number of bits given in the lower 64 bits of \a __count,
242006c3fb27SDimitry Andric ///    shifting in zero bits, and returns the result. If \a __count is greater
242106c3fb27SDimitry Andric ///    than 31, the returned result is all zeroes.
242206c3fb27SDimitry Andric ///
242306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
242406c3fb27SDimitry Andric ///
242506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLD instruction.
242606c3fb27SDimitry Andric ///
242706c3fb27SDimitry Andric /// \param __a
242806c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
242906c3fb27SDimitry Andric /// \param __count
243006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
243106c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
243206c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
24330b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
24340b57cec5SDimitry Andric _mm256_srl_epi32(__m256i __a, __m128i __count)
24350b57cec5SDimitry Andric {
24360b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
24370b57cec5SDimitry Andric }
24380b57cec5SDimitry Andric 
243906c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
244006c3fb27SDimitry Andric ///    right by \a __count bits, shifting in zero bits, and returns the result.
244106c3fb27SDimitry Andric ///    If \a __count is greater than 63, the returned result is all zeroes.
244206c3fb27SDimitry Andric ///
244306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
244406c3fb27SDimitry Andric ///
244506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLQ instruction.
244606c3fb27SDimitry Andric ///
244706c3fb27SDimitry Andric /// \param __a
244806c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] to be shifted.
244906c3fb27SDimitry Andric /// \param __count
245006c3fb27SDimitry Andric ///    An unsigned integer value specifying the shift count (in bits).
245106c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
24520b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
24530b57cec5SDimitry Andric _mm256_srli_epi64(__m256i __a, int __count)
24540b57cec5SDimitry Andric {
24550b57cec5SDimitry Andric   return __builtin_ia32_psrlqi256((__v4di)__a, __count);
24560b57cec5SDimitry Andric }
24570b57cec5SDimitry Andric 
245806c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
245906c3fb27SDimitry Andric ///    right by the number of bits given in the lower 64 bits of \a __count,
246006c3fb27SDimitry Andric ///    shifting in zero bits, and returns the result. If \a __count is greater
246106c3fb27SDimitry Andric ///    than 63, the returned result is all zeroes.
246206c3fb27SDimitry Andric ///
246306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
246406c3fb27SDimitry Andric ///
246506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLQ instruction.
246606c3fb27SDimitry Andric ///
246706c3fb27SDimitry Andric /// \param __a
246806c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] to be shifted.
246906c3fb27SDimitry Andric /// \param __count
247006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
247106c3fb27SDimitry Andric ///    shift count (in bits). The upper element is ignored.
247206c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
24730b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
24740b57cec5SDimitry Andric _mm256_srl_epi64(__m256i __a, __m128i __count)
24750b57cec5SDimitry Andric {
24760b57cec5SDimitry Andric   return __builtin_ia32_psrlq256((__v4di)__a, __count);
24770b57cec5SDimitry Andric }
24780b57cec5SDimitry Andric 
247906c3fb27SDimitry Andric /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
248006c3fb27SDimitry Andric ///    vectors. Returns the lower 8 bits of each difference in the
248106c3fb27SDimitry Andric ///    corresponding byte of the 256-bit integer vector result (overflow is
248206c3fb27SDimitry Andric ///    ignored).
248306c3fb27SDimitry Andric ///
248406c3fb27SDimitry Andric /// \code{.operation}
248506c3fb27SDimitry Andric /// FOR i := 0 TO 31
248606c3fb27SDimitry Andric ///   j := i*8
248706c3fb27SDimitry Andric ///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
248806c3fb27SDimitry Andric /// ENDFOR
248906c3fb27SDimitry Andric /// \endcode
249006c3fb27SDimitry Andric ///
249106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
249206c3fb27SDimitry Andric ///
249306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBB instruction.
249406c3fb27SDimitry Andric ///
249506c3fb27SDimitry Andric /// \param __a
249606c3fb27SDimitry Andric ///    A 256-bit integer vector containing the minuends.
249706c3fb27SDimitry Andric /// \param __b
249806c3fb27SDimitry Andric ///    A 256-bit integer vector containing the subtrahends.
249906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the differences.
25000b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
25010b57cec5SDimitry Andric _mm256_sub_epi8(__m256i __a, __m256i __b)
25020b57cec5SDimitry Andric {
25030b57cec5SDimitry Andric   return (__m256i)((__v32qu)__a - (__v32qu)__b);
25040b57cec5SDimitry Andric }
25050b57cec5SDimitry Andric 
250606c3fb27SDimitry Andric /// Subtracts 16-bit integers from corresponding elements of two 256-bit
250706c3fb27SDimitry Andric ///    vectors of [16 x i16]. Returns the lower 16 bits of each difference in
250806c3fb27SDimitry Andric ///    the corresponding element of the [16 x i16] result (overflow is
250906c3fb27SDimitry Andric ///    ignored).
251006c3fb27SDimitry Andric ///
251106c3fb27SDimitry Andric /// \code{.operation}
251206c3fb27SDimitry Andric /// FOR i := 0 TO 15
251306c3fb27SDimitry Andric ///   j := i*16
251406c3fb27SDimitry Andric ///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
251506c3fb27SDimitry Andric /// ENDFOR
251606c3fb27SDimitry Andric /// \endcode
251706c3fb27SDimitry Andric ///
251806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
251906c3fb27SDimitry Andric ///
252006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBW instruction.
252106c3fb27SDimitry Andric ///
252206c3fb27SDimitry Andric /// \param __a
252306c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the minuends.
252406c3fb27SDimitry Andric /// \param __b
252506c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the subtrahends.
252606c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences.
25270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
25280b57cec5SDimitry Andric _mm256_sub_epi16(__m256i __a, __m256i __b)
25290b57cec5SDimitry Andric {
25300b57cec5SDimitry Andric   return (__m256i)((__v16hu)__a - (__v16hu)__b);
25310b57cec5SDimitry Andric }
25320b57cec5SDimitry Andric 
253306c3fb27SDimitry Andric /// Subtracts 32-bit integers from corresponding elements of two 256-bit
253406c3fb27SDimitry Andric ///    vectors of [8 x i32]. Returns the lower 32 bits of each difference in
253506c3fb27SDimitry Andric ///    the corresponding element of the [8 x i32] result (overflow is ignored).
253606c3fb27SDimitry Andric ///
253706c3fb27SDimitry Andric /// \code{.operation}
253806c3fb27SDimitry Andric /// FOR i := 0 TO 7
253906c3fb27SDimitry Andric ///   j := i*32
254006c3fb27SDimitry Andric ///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
254106c3fb27SDimitry Andric /// ENDFOR
254206c3fb27SDimitry Andric /// \endcode
254306c3fb27SDimitry Andric ///
254406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
254506c3fb27SDimitry Andric ///
254606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBD instruction.
254706c3fb27SDimitry Andric ///
254806c3fb27SDimitry Andric /// \param __a
254906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the minuends.
255006c3fb27SDimitry Andric /// \param __b
255106c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the subtrahends.
255206c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the differences.
25530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
25540b57cec5SDimitry Andric _mm256_sub_epi32(__m256i __a, __m256i __b)
25550b57cec5SDimitry Andric {
25560b57cec5SDimitry Andric   return (__m256i)((__v8su)__a - (__v8su)__b);
25570b57cec5SDimitry Andric }
25580b57cec5SDimitry Andric 
255906c3fb27SDimitry Andric /// Subtracts 64-bit integers from corresponding elements of two 256-bit
256006c3fb27SDimitry Andric ///    vectors of [4 x i64]. Returns the lower 64 bits of each difference in
256106c3fb27SDimitry Andric ///    the corresponding element of the [4 x i64] result (overflow is ignored).
256206c3fb27SDimitry Andric ///
256306c3fb27SDimitry Andric /// \code{.operation}
256406c3fb27SDimitry Andric /// FOR i := 0 TO 3
256506c3fb27SDimitry Andric ///   j := i*64
256606c3fb27SDimitry Andric ///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
256706c3fb27SDimitry Andric /// ENDFOR
256806c3fb27SDimitry Andric /// \endcode
256906c3fb27SDimitry Andric ///
257006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
257106c3fb27SDimitry Andric ///
257206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBQ instruction.
257306c3fb27SDimitry Andric ///
257406c3fb27SDimitry Andric /// \param __a
257506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the minuends.
257606c3fb27SDimitry Andric /// \param __b
257706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the subtrahends.
257806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the differences.
25790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
25800b57cec5SDimitry Andric _mm256_sub_epi64(__m256i __a, __m256i __b)
25810b57cec5SDimitry Andric {
25820b57cec5SDimitry Andric   return (__m256i)((__v4du)__a - (__v4du)__b);
25830b57cec5SDimitry Andric }
25840b57cec5SDimitry Andric 
258506c3fb27SDimitry Andric /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
258606c3fb27SDimitry Andric ///    vectors using signed saturation, and returns each difference in the
258706c3fb27SDimitry Andric ///    corresponding byte of the 256-bit integer vector result.
258806c3fb27SDimitry Andric ///
258906c3fb27SDimitry Andric /// \code{.operation}
259006c3fb27SDimitry Andric /// FOR i := 0 TO 31
259106c3fb27SDimitry Andric ///   j := i*8
259206c3fb27SDimitry Andric ///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
259306c3fb27SDimitry Andric /// ENDFOR
259406c3fb27SDimitry Andric /// \endcode
259506c3fb27SDimitry Andric ///
259606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
259706c3fb27SDimitry Andric ///
259806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBSB instruction.
259906c3fb27SDimitry Andric ///
260006c3fb27SDimitry Andric /// \param __a
260106c3fb27SDimitry Andric ///    A 256-bit integer vector containing the minuends.
260206c3fb27SDimitry Andric /// \param __b
260306c3fb27SDimitry Andric ///    A 256-bit integer vector containing the subtrahends.
260406c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the differences.
26050b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
26060b57cec5SDimitry Andric _mm256_subs_epi8(__m256i __a, __m256i __b)
26070b57cec5SDimitry Andric {
260881ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
26090b57cec5SDimitry Andric }
26100b57cec5SDimitry Andric 
261106c3fb27SDimitry Andric /// Subtracts 16-bit integers from corresponding elements of two 256-bit
261206c3fb27SDimitry Andric ///    vectors of [16 x i16] using signed saturation, and returns each
261306c3fb27SDimitry Andric ///    difference in the corresponding element of the [16 x i16] result.
261406c3fb27SDimitry Andric ///
261506c3fb27SDimitry Andric /// \code{.operation}
261606c3fb27SDimitry Andric /// FOR i := 0 TO 15
261706c3fb27SDimitry Andric ///   j := i*16
261806c3fb27SDimitry Andric ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
261906c3fb27SDimitry Andric /// ENDFOR
262006c3fb27SDimitry Andric /// \endcode
262106c3fb27SDimitry Andric ///
262206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
262306c3fb27SDimitry Andric ///
262406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBSW instruction.
262506c3fb27SDimitry Andric ///
262606c3fb27SDimitry Andric /// \param __a
262706c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the minuends.
262806c3fb27SDimitry Andric /// \param __b
262906c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the subtrahends.
263006c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences.
26310b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
26320b57cec5SDimitry Andric _mm256_subs_epi16(__m256i __a, __m256i __b)
26330b57cec5SDimitry Andric {
263481ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
26350b57cec5SDimitry Andric }
26360b57cec5SDimitry Andric 
263706c3fb27SDimitry Andric /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
263806c3fb27SDimitry Andric ///    vectors using unsigned saturation, and returns each difference in the
263906c3fb27SDimitry Andric ///    corresponding byte of the 256-bit integer vector result. For each byte,
264006c3fb27SDimitry Andric ///    computes <c> result = __a - __b </c>.
264106c3fb27SDimitry Andric ///
264206c3fb27SDimitry Andric /// \code{.operation}
264306c3fb27SDimitry Andric /// FOR i := 0 TO 31
264406c3fb27SDimitry Andric ///   j := i*8
264506c3fb27SDimitry Andric ///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
264606c3fb27SDimitry Andric /// ENDFOR
264706c3fb27SDimitry Andric /// \endcode
264806c3fb27SDimitry Andric ///
264906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
265006c3fb27SDimitry Andric ///
265106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBUSB instruction.
265206c3fb27SDimitry Andric ///
265306c3fb27SDimitry Andric /// \param __a
265406c3fb27SDimitry Andric ///    A 256-bit integer vector containing the minuends.
265506c3fb27SDimitry Andric /// \param __b
265606c3fb27SDimitry Andric ///    A 256-bit integer vector containing the subtrahends.
265706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the differences.
26580b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
26590b57cec5SDimitry Andric _mm256_subs_epu8(__m256i __a, __m256i __b)
26600b57cec5SDimitry Andric {
266181ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
26620b57cec5SDimitry Andric }
26630b57cec5SDimitry Andric 
266406c3fb27SDimitry Andric /// Subtracts 16-bit integers from corresponding elements of two 256-bit
266506c3fb27SDimitry Andric ///    vectors of [16 x i16] using unsigned saturation, and returns each
266606c3fb27SDimitry Andric ///    difference in the corresponding element of the [16 x i16] result.
266706c3fb27SDimitry Andric ///
266806c3fb27SDimitry Andric /// \code{.operation}
266906c3fb27SDimitry Andric /// FOR i := 0 TO 15
267006c3fb27SDimitry Andric ///   j := i*16
267106c3fb27SDimitry Andric ///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
267206c3fb27SDimitry Andric /// ENDFOR
267306c3fb27SDimitry Andric /// \endcode
267406c3fb27SDimitry Andric ///
267506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
267606c3fb27SDimitry Andric ///
267706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBUSW instruction.
267806c3fb27SDimitry Andric ///
267906c3fb27SDimitry Andric /// \param __a
268006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the minuends.
268106c3fb27SDimitry Andric /// \param __b
268206c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] containing the subtrahends.
268306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences.
26840b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
26850b57cec5SDimitry Andric _mm256_subs_epu16(__m256i __a, __m256i __b)
26860b57cec5SDimitry Andric {
268781ad6265SDimitry Andric   return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
26880b57cec5SDimitry Andric }
26890b57cec5SDimitry Andric 
269006c3fb27SDimitry Andric /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
269106c3fb27SDimitry Andric ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
269206c3fb27SDimitry Andric ///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
269306c3fb27SDimitry Andric ///    input; other bits in these parameters are ignored.
269406c3fb27SDimitry Andric ///
269506c3fb27SDimitry Andric /// \code{.operation}
269606c3fb27SDimitry Andric /// result[7:0] := __a[71:64]
269706c3fb27SDimitry Andric /// result[15:8] := __b[71:64]
269806c3fb27SDimitry Andric /// result[23:16] := __a[79:72]
269906c3fb27SDimitry Andric /// result[31:24] := __b[79:72]
270006c3fb27SDimitry Andric /// . . .
270106c3fb27SDimitry Andric /// result[127:120] := __b[127:120]
270206c3fb27SDimitry Andric /// result[135:128] := __a[199:192]
270306c3fb27SDimitry Andric /// . . .
270406c3fb27SDimitry Andric /// result[255:248] := __b[255:248]
270506c3fb27SDimitry Andric /// \endcode
270606c3fb27SDimitry Andric ///
270706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
270806c3fb27SDimitry Andric ///
270906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
271006c3fb27SDimitry Andric ///
271106c3fb27SDimitry Andric /// \param __a
271206c3fb27SDimitry Andric ///    A 256-bit integer vector used as the source for the even-numbered bytes
271306c3fb27SDimitry Andric ///    of the result.
271406c3fb27SDimitry Andric /// \param __b
271506c3fb27SDimitry Andric ///    A 256-bit integer vector used as the source for the odd-numbered bytes
271606c3fb27SDimitry Andric ///    of the result.
271706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
27180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
27190b57cec5SDimitry Andric _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
27200b57cec5SDimitry Andric {
27210b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
27220b57cec5SDimitry Andric }
27230b57cec5SDimitry Andric 
272406c3fb27SDimitry Andric /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
272506c3fb27SDimitry Andric ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
272606c3fb27SDimitry Andric ///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
272706c3fb27SDimitry Andric ///    128-bit half of \a __a and \a __b as input; other bits in these
272806c3fb27SDimitry Andric ///    parameters are ignored.
272906c3fb27SDimitry Andric ///
273006c3fb27SDimitry Andric /// \code{.operation}
273106c3fb27SDimitry Andric /// result[15:0] := __a[79:64]
273206c3fb27SDimitry Andric /// result[31:16] := __b[79:64]
273306c3fb27SDimitry Andric /// result[47:32] := __a[95:80]
273406c3fb27SDimitry Andric /// result[63:48] := __b[95:80]
273506c3fb27SDimitry Andric /// . . .
273606c3fb27SDimitry Andric /// result[127:112] := __b[127:112]
273706c3fb27SDimitry Andric /// result[143:128] := __a[207:192]
273806c3fb27SDimitry Andric /// . . .
273906c3fb27SDimitry Andric /// result[255:240] := __b[255:240]
274006c3fb27SDimitry Andric /// \endcode
274106c3fb27SDimitry Andric ///
274206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
274306c3fb27SDimitry Andric ///
274406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
274506c3fb27SDimitry Andric ///
274606c3fb27SDimitry Andric /// \param __a
274706c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
274806c3fb27SDimitry Andric ///    elements of the result.
274906c3fb27SDimitry Andric /// \param __b
275006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
275106c3fb27SDimitry Andric ///    elements of the result.
275206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
27530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
27540b57cec5SDimitry Andric _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
27550b57cec5SDimitry Andric {
27560b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
27570b57cec5SDimitry Andric }
27580b57cec5SDimitry Andric 
275906c3fb27SDimitry Andric /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
276006c3fb27SDimitry Andric ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
276106c3fb27SDimitry Andric ///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
276206c3fb27SDimitry Andric ///    of \a __a and \a __b as input; other bits in these parameters are
276306c3fb27SDimitry Andric ///    ignored.
276406c3fb27SDimitry Andric ///
276506c3fb27SDimitry Andric /// \code{.operation}
276606c3fb27SDimitry Andric /// result[31:0] := __a[95:64]
276706c3fb27SDimitry Andric /// result[63:32] := __b[95:64]
276806c3fb27SDimitry Andric /// result[95:64] := __a[127:96]
276906c3fb27SDimitry Andric /// result[127:96] := __b[127:96]
277006c3fb27SDimitry Andric /// result[159:128] := __a[223:192]
277106c3fb27SDimitry Andric /// result[191:160] := __b[223:192]
277206c3fb27SDimitry Andric /// result[223:192] := __a[255:224]
277306c3fb27SDimitry Andric /// result[255:224] := __b[255:224]
277406c3fb27SDimitry Andric /// \endcode
277506c3fb27SDimitry Andric ///
277606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
277706c3fb27SDimitry Andric ///
277806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
277906c3fb27SDimitry Andric ///
278006c3fb27SDimitry Andric /// \param __a
278106c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
278206c3fb27SDimitry Andric ///    elements of the result.
278306c3fb27SDimitry Andric /// \param __b
278406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
278506c3fb27SDimitry Andric ///    elements of the result.
278606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
27870b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
27880b57cec5SDimitry Andric _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
27890b57cec5SDimitry Andric {
27900b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
27910b57cec5SDimitry Andric }
27920b57cec5SDimitry Andric 
279306c3fb27SDimitry Andric /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
279406c3fb27SDimitry Andric ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
279506c3fb27SDimitry Andric ///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
279606c3fb27SDimitry Andric ///    of \a __a and \a __b as input; other bits in these parameters are
279706c3fb27SDimitry Andric ///    ignored.
279806c3fb27SDimitry Andric ///
279906c3fb27SDimitry Andric /// \code{.operation}
280006c3fb27SDimitry Andric /// result[63:0] := __a[127:64]
280106c3fb27SDimitry Andric /// result[127:64] := __b[127:64]
280206c3fb27SDimitry Andric /// result[191:128] := __a[255:192]
280306c3fb27SDimitry Andric /// result[255:192] := __b[255:192]
280406c3fb27SDimitry Andric /// \endcode
280506c3fb27SDimitry Andric ///
280606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
280706c3fb27SDimitry Andric ///
280806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
280906c3fb27SDimitry Andric ///
281006c3fb27SDimitry Andric /// \param __a
281106c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
281206c3fb27SDimitry Andric ///    elements of the result.
281306c3fb27SDimitry Andric /// \param __b
281406c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
281506c3fb27SDimitry Andric ///    elements of the result.
281606c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
28170b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
28180b57cec5SDimitry Andric _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
28190b57cec5SDimitry Andric {
28200b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
28210b57cec5SDimitry Andric }
28220b57cec5SDimitry Andric 
282306c3fb27SDimitry Andric /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
282406c3fb27SDimitry Andric ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
282506c3fb27SDimitry Andric ///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
282606c3fb27SDimitry Andric ///    input; other bits in these parameters are ignored.
282706c3fb27SDimitry Andric ///
282806c3fb27SDimitry Andric /// \code{.operation}
282906c3fb27SDimitry Andric /// result[7:0] := __a[7:0]
283006c3fb27SDimitry Andric /// result[15:8] := __b[7:0]
283106c3fb27SDimitry Andric /// result[23:16] := __a[15:8]
283206c3fb27SDimitry Andric /// result[31:24] := __b[15:8]
283306c3fb27SDimitry Andric /// . . .
283406c3fb27SDimitry Andric /// result[127:120] := __b[63:56]
283506c3fb27SDimitry Andric /// result[135:128] := __a[135:128]
283606c3fb27SDimitry Andric /// . . .
283706c3fb27SDimitry Andric /// result[255:248] := __b[191:184]
283806c3fb27SDimitry Andric /// \endcode
283906c3fb27SDimitry Andric ///
284006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
284106c3fb27SDimitry Andric ///
284206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
284306c3fb27SDimitry Andric ///
284406c3fb27SDimitry Andric /// \param __a
284506c3fb27SDimitry Andric ///    A 256-bit integer vector used as the source for the even-numbered bytes
284606c3fb27SDimitry Andric ///    of the result.
284706c3fb27SDimitry Andric /// \param __b
284806c3fb27SDimitry Andric ///    A 256-bit integer vector used as the source for the odd-numbered bytes
284906c3fb27SDimitry Andric ///    of the result.
285006c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
28510b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
28520b57cec5SDimitry Andric _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
28530b57cec5SDimitry Andric {
28540b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
28550b57cec5SDimitry Andric }
28560b57cec5SDimitry Andric 
285706c3fb27SDimitry Andric /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
285806c3fb27SDimitry Andric ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
285906c3fb27SDimitry Andric ///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
286006c3fb27SDimitry Andric ///    128-bit half of \a __a and \a __b as input; other bits in these
286106c3fb27SDimitry Andric ///    parameters are ignored.
286206c3fb27SDimitry Andric ///
286306c3fb27SDimitry Andric /// \code{.operation}
286406c3fb27SDimitry Andric /// result[15:0] := __a[15:0]
286506c3fb27SDimitry Andric /// result[31:16] := __b[15:0]
286606c3fb27SDimitry Andric /// result[47:32] := __a[31:16]
286706c3fb27SDimitry Andric /// result[63:48] := __b[31:16]
286806c3fb27SDimitry Andric /// . . .
286906c3fb27SDimitry Andric /// result[127:112] := __b[63:48]
287006c3fb27SDimitry Andric /// result[143:128] := __a[143:128]
287106c3fb27SDimitry Andric /// . . .
287206c3fb27SDimitry Andric /// result[255:240] := __b[191:176]
287306c3fb27SDimitry Andric /// \endcode
287406c3fb27SDimitry Andric ///
287506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
287606c3fb27SDimitry Andric ///
287706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
287806c3fb27SDimitry Andric ///
287906c3fb27SDimitry Andric /// \param __a
288006c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
288106c3fb27SDimitry Andric ///    elements of the result.
288206c3fb27SDimitry Andric /// \param __b
288306c3fb27SDimitry Andric ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
288406c3fb27SDimitry Andric ///    elements of the result.
288506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
28860b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
28870b57cec5SDimitry Andric _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
28880b57cec5SDimitry Andric {
28890b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
28900b57cec5SDimitry Andric }
28910b57cec5SDimitry Andric 
289206c3fb27SDimitry Andric /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
289306c3fb27SDimitry Andric ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
289406c3fb27SDimitry Andric ///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
289506c3fb27SDimitry Andric ///    of \a __a and \a __b as input; other bits in these parameters are
289606c3fb27SDimitry Andric ///    ignored.
289706c3fb27SDimitry Andric ///
289806c3fb27SDimitry Andric /// \code{.operation}
289906c3fb27SDimitry Andric /// result[31:0] := __a[31:0]
290006c3fb27SDimitry Andric /// result[63:32] := __b[31:0]
290106c3fb27SDimitry Andric /// result[95:64] := __a[63:32]
290206c3fb27SDimitry Andric /// result[127:96] := __b[63:32]
290306c3fb27SDimitry Andric /// result[159:128] := __a[159:128]
290406c3fb27SDimitry Andric /// result[191:160] := __b[159:128]
290506c3fb27SDimitry Andric /// result[223:192] := __a[191:160]
290606c3fb27SDimitry Andric /// result[255:224] := __b[191:160]
290706c3fb27SDimitry Andric /// \endcode
290806c3fb27SDimitry Andric ///
290906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
291006c3fb27SDimitry Andric ///
291106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
291206c3fb27SDimitry Andric ///
291306c3fb27SDimitry Andric /// \param __a
291406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
291506c3fb27SDimitry Andric ///    elements of the result.
291606c3fb27SDimitry Andric /// \param __b
291706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
291806c3fb27SDimitry Andric ///    elements of the result.
291906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
29200b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
29210b57cec5SDimitry Andric _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
29220b57cec5SDimitry Andric {
29230b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
29240b57cec5SDimitry Andric }
29250b57cec5SDimitry Andric 
292606c3fb27SDimitry Andric /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
292706c3fb27SDimitry Andric ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
292806c3fb27SDimitry Andric ///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
292906c3fb27SDimitry Andric ///    of \a __a and \a __b as input; other bits in these parameters are
293006c3fb27SDimitry Andric ///    ignored.
293106c3fb27SDimitry Andric ///
293206c3fb27SDimitry Andric /// \code{.operation}
293306c3fb27SDimitry Andric /// result[63:0] := __a[63:0]
293406c3fb27SDimitry Andric /// result[127:64] := __b[63:0]
293506c3fb27SDimitry Andric /// result[191:128] := __a[191:128]
293606c3fb27SDimitry Andric /// result[255:192] := __b[191:128]
293706c3fb27SDimitry Andric /// \endcode
293806c3fb27SDimitry Andric ///
293906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
294006c3fb27SDimitry Andric ///
294106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
294206c3fb27SDimitry Andric ///
294306c3fb27SDimitry Andric /// \param __a
294406c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
294506c3fb27SDimitry Andric ///    elements of the result.
294606c3fb27SDimitry Andric /// \param __b
294706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
294806c3fb27SDimitry Andric ///    elements of the result.
294906c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
29500b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
29510b57cec5SDimitry Andric _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
29520b57cec5SDimitry Andric {
29530b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
29540b57cec5SDimitry Andric }
29550b57cec5SDimitry Andric 
295606c3fb27SDimitry Andric /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
295706c3fb27SDimitry Andric ///    \a __b.
295806c3fb27SDimitry Andric ///
295906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
296006c3fb27SDimitry Andric ///
296106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPXOR instruction.
296206c3fb27SDimitry Andric ///
296306c3fb27SDimitry Andric /// \param __a
296406c3fb27SDimitry Andric ///    A 256-bit integer vector.
296506c3fb27SDimitry Andric /// \param __b
296606c3fb27SDimitry Andric ///    A 256-bit integer vector.
296706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
29680b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
29690b57cec5SDimitry Andric _mm256_xor_si256(__m256i __a, __m256i __b)
29700b57cec5SDimitry Andric {
29710b57cec5SDimitry Andric   return (__m256i)((__v4du)__a ^ (__v4du)__b);
29720b57cec5SDimitry Andric }
29730b57cec5SDimitry Andric 
297406c3fb27SDimitry Andric /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
297506c3fb27SDimitry Andric ///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
297606c3fb27SDimitry Andric ///   boundary.
297706c3fb27SDimitry Andric ///
297806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
297906c3fb27SDimitry Andric ///
298006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
298106c3fb27SDimitry Andric ///
298206c3fb27SDimitry Andric /// \param __V
298306c3fb27SDimitry Andric ///    A pointer to the 32-byte aligned memory containing the vector to load.
298406c3fb27SDimitry Andric /// \returns A 256-bit integer vector loaded from memory.
29850b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2986*5f757f3fSDimitry Andric _mm256_stream_load_si256(const void *__V)
29870b57cec5SDimitry Andric {
29880b57cec5SDimitry Andric   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
29890b57cec5SDimitry Andric   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
29900b57cec5SDimitry Andric }
29910b57cec5SDimitry Andric 
299206c3fb27SDimitry Andric /// Broadcasts the 32-bit floating-point value from the low element of the
299306c3fb27SDimitry Andric ///    128-bit vector of [4 x float] in \a __X to all elements of the result's
299406c3fb27SDimitry Andric ///    128-bit vector of [4 x float].
299506c3fb27SDimitry Andric ///
299606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
299706c3fb27SDimitry Andric ///
299806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
299906c3fb27SDimitry Andric ///
300006c3fb27SDimitry Andric /// \param __X
300106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
300206c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
30030b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
30040b57cec5SDimitry Andric _mm_broadcastss_ps(__m128 __X)
30050b57cec5SDimitry Andric {
30060b57cec5SDimitry Andric   return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
30070b57cec5SDimitry Andric }
30080b57cec5SDimitry Andric 
300906c3fb27SDimitry Andric /// Broadcasts the 64-bit floating-point value from the low element of the
301006c3fb27SDimitry Andric ///    128-bit vector of [2 x double] in \a __a to both elements of the
301106c3fb27SDimitry Andric ///    result's 128-bit vector of [2 x double].
301206c3fb27SDimitry Andric ///
301306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
301406c3fb27SDimitry Andric ///
301506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c MOVDDUP instruction.
301606c3fb27SDimitry Andric ///
301706c3fb27SDimitry Andric /// \param __a
301806c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
301906c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
30200b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
30210b57cec5SDimitry Andric _mm_broadcastsd_pd(__m128d __a)
30220b57cec5SDimitry Andric {
30230b57cec5SDimitry Andric   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
30240b57cec5SDimitry Andric }
30250b57cec5SDimitry Andric 
302606c3fb27SDimitry Andric /// Broadcasts the 32-bit floating-point value from the low element of the
302706c3fb27SDimitry Andric ///    128-bit vector of [4 x float] in \a __X to all elements of the
302806c3fb27SDimitry Andric ///    result's 256-bit vector of [8 x float].
302906c3fb27SDimitry Andric ///
303006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
303106c3fb27SDimitry Andric ///
303206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
303306c3fb27SDimitry Andric ///
303406c3fb27SDimitry Andric /// \param __X
303506c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
303606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
30370b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
30380b57cec5SDimitry Andric _mm256_broadcastss_ps(__m128 __X)
30390b57cec5SDimitry Andric {
30400b57cec5SDimitry Andric   return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
30410b57cec5SDimitry Andric }
30420b57cec5SDimitry Andric 
304306c3fb27SDimitry Andric /// Broadcasts the 64-bit floating-point value from the low element of the
304406c3fb27SDimitry Andric ///    128-bit vector of [2 x double] in \a __X to all elements of the
304506c3fb27SDimitry Andric ///    result's 256-bit vector of [4 x double].
304606c3fb27SDimitry Andric ///
304706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
304806c3fb27SDimitry Andric ///
304906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
305006c3fb27SDimitry Andric ///
305106c3fb27SDimitry Andric /// \param __X
305206c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
305306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
30540b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
30550b57cec5SDimitry Andric _mm256_broadcastsd_pd(__m128d __X)
30560b57cec5SDimitry Andric {
30570b57cec5SDimitry Andric   return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
30580b57cec5SDimitry Andric }
30590b57cec5SDimitry Andric 
306006c3fb27SDimitry Andric /// Broadcasts the 128-bit integer data from \a __X to both the lower and
306106c3fb27SDimitry Andric ///    upper halves of the 256-bit result.
306206c3fb27SDimitry Andric ///
306306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
306406c3fb27SDimitry Andric ///
306506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
306606c3fb27SDimitry Andric ///
306706c3fb27SDimitry Andric /// \param __X
306806c3fb27SDimitry Andric ///    A 128-bit integer vector to be broadcast.
306906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
30700b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
30710b57cec5SDimitry Andric _mm256_broadcastsi128_si256(__m128i __X)
30720b57cec5SDimitry Andric {
30730b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
30740b57cec5SDimitry Andric }
30750b57cec5SDimitry Andric 
/// Alternate spelling of this intrinsic; expands directly to
///    _mm256_broadcastsi128_si256(X).
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
30775ffd83dbSDimitry Andric 
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))
31140b57cec5SDimitry Andric 
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))
31510b57cec5SDimitry Andric 
315206c3fb27SDimitry Andric /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
315306c3fb27SDimitry Andric ///    bytes of the 256-bit result.
315406c3fb27SDimitry Andric ///
315506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
315606c3fb27SDimitry Andric ///
315706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
315806c3fb27SDimitry Andric ///
315906c3fb27SDimitry Andric /// \param __X
316006c3fb27SDimitry Andric ///    A 128-bit integer vector whose low byte will be broadcast.
316106c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result.
31620b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
31630b57cec5SDimitry Andric _mm256_broadcastb_epi8(__m128i __X)
31640b57cec5SDimitry Andric {
31650b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31660b57cec5SDimitry Andric }
31670b57cec5SDimitry Andric 
316806c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
316906c3fb27SDimitry Andric ///    to all elements of the result's 256-bit vector of [16 x i16].
317006c3fb27SDimitry Andric ///
317106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
317206c3fb27SDimitry Andric ///
317306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
317406c3fb27SDimitry Andric ///
317506c3fb27SDimitry Andric /// \param __X
317606c3fb27SDimitry Andric ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
317706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result.
31780b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
31790b57cec5SDimitry Andric _mm256_broadcastw_epi16(__m128i __X)
31800b57cec5SDimitry Andric {
31810b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31820b57cec5SDimitry Andric }
31830b57cec5SDimitry Andric 
318406c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
318506c3fb27SDimitry Andric ///    to all elements of the result's 256-bit vector of [8 x i32].
318606c3fb27SDimitry Andric ///
318706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
318806c3fb27SDimitry Andric ///
318906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
319006c3fb27SDimitry Andric ///
319106c3fb27SDimitry Andric /// \param __X
319206c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
319306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
31940b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
31950b57cec5SDimitry Andric _mm256_broadcastd_epi32(__m128i __X)
31960b57cec5SDimitry Andric {
31970b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
31980b57cec5SDimitry Andric }
31990b57cec5SDimitry Andric 
320006c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
320106c3fb27SDimitry Andric ///    to all elements of the result's 256-bit vector of [4 x i64].
320206c3fb27SDimitry Andric ///
320306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
320406c3fb27SDimitry Andric ///
320506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
320606c3fb27SDimitry Andric ///
320706c3fb27SDimitry Andric /// \param __X
320806c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
320906c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
32100b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
32110b57cec5SDimitry Andric _mm256_broadcastq_epi64(__m128i __X)
32120b57cec5SDimitry Andric {
32130b57cec5SDimitry Andric   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
32140b57cec5SDimitry Andric }
32150b57cec5SDimitry Andric 
321606c3fb27SDimitry Andric /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
321706c3fb27SDimitry Andric ///    bytes of the 128-bit result.
321806c3fb27SDimitry Andric ///
321906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
322006c3fb27SDimitry Andric ///
322106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
322206c3fb27SDimitry Andric ///
322306c3fb27SDimitry Andric /// \param __X
322406c3fb27SDimitry Andric ///    A 128-bit integer vector whose low byte will be broadcast.
322506c3fb27SDimitry Andric /// \returns A 128-bit integer vector containing the result.
32260b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
32270b57cec5SDimitry Andric _mm_broadcastb_epi8(__m128i __X)
32280b57cec5SDimitry Andric {
32290b57cec5SDimitry Andric   return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
32300b57cec5SDimitry Andric }
32310b57cec5SDimitry Andric 
323206c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
323306c3fb27SDimitry Andric ///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
323406c3fb27SDimitry Andric ///
323506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
323606c3fb27SDimitry Andric ///
323706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
323806c3fb27SDimitry Andric ///
323906c3fb27SDimitry Andric /// \param __X
324006c3fb27SDimitry Andric ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
324106c3fb27SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the result.
32420b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
32430b57cec5SDimitry Andric _mm_broadcastw_epi16(__m128i __X)
32440b57cec5SDimitry Andric {
32450b57cec5SDimitry Andric   return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
32460b57cec5SDimitry Andric }
32470b57cec5SDimitry Andric 
324806c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
324906c3fb27SDimitry Andric ///    to all elements of the result's vector of [4 x i32].
325006c3fb27SDimitry Andric ///
325106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
325206c3fb27SDimitry Andric ///
325306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
325406c3fb27SDimitry Andric ///
325506c3fb27SDimitry Andric /// \param __X
325606c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
325706c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result.
32580b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
32590b57cec5SDimitry Andric _mm_broadcastd_epi32(__m128i __X)
32600b57cec5SDimitry Andric {
32610b57cec5SDimitry Andric   return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
32620b57cec5SDimitry Andric }
32630b57cec5SDimitry Andric 
326406c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
326506c3fb27SDimitry Andric ///    to both elements of the result's 128-bit vector of [2 x i64].
326606c3fb27SDimitry Andric ///
326706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
326806c3fb27SDimitry Andric ///
326906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
327006c3fb27SDimitry Andric ///
327106c3fb27SDimitry Andric /// \param __X
327206c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
327306c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the result.
32740b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
32750b57cec5SDimitry Andric _mm_broadcastq_epi64(__m128i __X)
32760b57cec5SDimitry Andric {
32770b57cec5SDimitry Andric   return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
32780b57cec5SDimitry Andric }
32790b57cec5SDimitry Andric 
328006c3fb27SDimitry Andric /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
328106c3fb27SDimitry Andric ///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
328206c3fb27SDimitry Andric ///    elements of the 256-bit vector of [8 x i32] in \a __b.
328306c3fb27SDimitry Andric ///
328406c3fb27SDimitry Andric /// \code{.operation}
328506c3fb27SDimitry Andric /// FOR i := 0 TO 7
328606c3fb27SDimitry Andric ///   j := i*32
328706c3fb27SDimitry Andric ///   k := __b[j+2:j] * 32
328806c3fb27SDimitry Andric ///   result[j+31:j] := __a[k+31:k]
328906c3fb27SDimitry Andric /// ENDFOR
329006c3fb27SDimitry Andric /// \endcode
329106c3fb27SDimitry Andric ///
329206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
329306c3fb27SDimitry Andric ///
329406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPERMD instruction.
329506c3fb27SDimitry Andric ///
329606c3fb27SDimitry Andric /// \param __a
329706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the source values.
329806c3fb27SDimitry Andric /// \param __b
329906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
330006c3fb27SDimitry Andric ///    \a __a.
330106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
33020b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
33030b57cec5SDimitry Andric _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
33040b57cec5SDimitry Andric {
33050b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
33060b57cec5SDimitry Andric }
33070b57cec5SDimitry Andric 
/// Sets the result's 256-bit vector of [4 x double] to copies of elements of
///    the 256-bit vector of [4 x double] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
33370b57cec5SDimitry Andric 
333806c3fb27SDimitry Andric /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
333906c3fb27SDimitry Andric ///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
334006c3fb27SDimitry Andric ///    the elements of the 256-bit vector of [8 x i32] in \a __b.
334106c3fb27SDimitry Andric ///
334206c3fb27SDimitry Andric /// \code{.operation}
334306c3fb27SDimitry Andric /// FOR i := 0 TO 7
334406c3fb27SDimitry Andric ///   j := i*32
334506c3fb27SDimitry Andric ///   k := __b[j+2:j] * 32
334606c3fb27SDimitry Andric ///   result[j+31:j] := __a[k+31:k]
334706c3fb27SDimitry Andric /// ENDFOR
334806c3fb27SDimitry Andric /// \endcode
334906c3fb27SDimitry Andric ///
335006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
335106c3fb27SDimitry Andric ///
335206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPERMPS instruction.
335306c3fb27SDimitry Andric ///
335406c3fb27SDimitry Andric /// \param __a
335506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the source values.
335606c3fb27SDimitry Andric /// \param __b
335706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
335806c3fb27SDimitry Andric ///    \a __a.
335906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
33600b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
33610b57cec5SDimitry Andric _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
33620b57cec5SDimitry Andric {
33630b57cec5SDimitry Andric   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
33640b57cec5SDimitry Andric }
33650b57cec5SDimitry Andric 
/// Sets the result's 256-bit vector of [4 x i64] to copies of elements of
///    the 256-bit vector of [4 x i64] in \a V as specified by the
///    immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] containing the source values.
/// \param M
///    An immediate 8-bit value specifying which elements to copy from \a V.
///    \a M[1:0] specifies the index in \a V for element 0 of the result,
///    \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
33950b57cec5SDimitry Andric 
/// Sets each half of the 256-bit result either to zero or to one of the
///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
///    as specified by the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing source values.
/// \param V2
///    A 256-bit integer vector containing source values.
/// \param M
///    An immediate value specifying how to form the result. Bits [3:0]
///    control the lower half of the result, bits [7:4] control the upper half.
///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
///    otherwise bits [1:0] determine the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
34410b57cec5SDimitry Andric 
/// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
///     of the immediate \a M is zero, extracts the lower half of \a V;
///     otherwise, extracts the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector containing the source values.
/// \param M
///    An immediate value specifying which half of \a V to extract.
/// \returns A 128-bit integer vector containing the result.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
34610b57cec5SDimitry Andric 
/// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
///     is zero, overwrites the lower half of the result; otherwise,
///     overwrites the upper half.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector containing a source value.
/// \param V2
///    A 128-bit integer vector containing a source value.
/// \param M
///    An immediate value specifying where to put \a V2 in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                         (__v2di)(__m128i)(V2), (int)(M)))
34850b57cec5SDimitry Andric 
348606c3fb27SDimitry Andric /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
348706c3fb27SDimitry Andric ///    the most significant bit of the corresponding element in the mask
348806c3fb27SDimitry Andric ///    \a __M is set; otherwise, sets that element of the result to zero.
348906c3fb27SDimitry Andric ///    Returns the 256-bit [8 x i32] result.
349006c3fb27SDimitry Andric ///
349106c3fb27SDimitry Andric /// \code{.operation}
349206c3fb27SDimitry Andric /// FOR i := 0 TO 7
349306c3fb27SDimitry Andric ///   j := i*32
349406c3fb27SDimitry Andric ///   IF __M[j+31] == 1
349506c3fb27SDimitry Andric ///     result[j+31:j] := Load32(__X+(i*4))
349606c3fb27SDimitry Andric ///   ELSE
349706c3fb27SDimitry Andric ///     result[j+31:j] := 0
349806c3fb27SDimitry Andric ///   FI
349906c3fb27SDimitry Andric /// ENDFOR
350006c3fb27SDimitry Andric /// \endcode
350106c3fb27SDimitry Andric ///
350206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
350306c3fb27SDimitry Andric ///
350406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
350506c3fb27SDimitry Andric ///
350606c3fb27SDimitry Andric /// \param __X
350706c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
350806c3fb27SDimitry Andric /// \param __M
350906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the mask bits.
351006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
351106c3fb27SDimitry Andric ///    elements.
35120b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
35130b57cec5SDimitry Andric _mm256_maskload_epi32(int const *__X, __m256i __M)
35140b57cec5SDimitry Andric {
35150b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
35160b57cec5SDimitry Andric }
35170b57cec5SDimitry Andric 
351806c3fb27SDimitry Andric /// Conditionally loads four 64-bit integer elements from memory \a __X, if
351906c3fb27SDimitry Andric ///    the most significant bit of the corresponding element in the mask
352006c3fb27SDimitry Andric ///    \a __M is set; otherwise, sets that element of the result to zero.
352106c3fb27SDimitry Andric ///    Returns the 256-bit [4 x i64] result.
352206c3fb27SDimitry Andric ///
352306c3fb27SDimitry Andric /// \code{.operation}
352406c3fb27SDimitry Andric /// FOR i := 0 TO 3
352506c3fb27SDimitry Andric ///   j := i*64
352606c3fb27SDimitry Andric ///   IF __M[j+63] == 1
352706c3fb27SDimitry Andric ///     result[j+63:j] := Load64(__X+(i*8))
352806c3fb27SDimitry Andric ///   ELSE
352906c3fb27SDimitry Andric ///     result[j+63:j] := 0
353006c3fb27SDimitry Andric ///   FI
353106c3fb27SDimitry Andric /// ENDFOR
353206c3fb27SDimitry Andric /// \endcode
353306c3fb27SDimitry Andric ///
353406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
353506c3fb27SDimitry Andric ///
353606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
353706c3fb27SDimitry Andric ///
353806c3fb27SDimitry Andric /// \param __X
353906c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
354006c3fb27SDimitry Andric /// \param __M
354106c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the mask bits.
354206c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
354306c3fb27SDimitry Andric ///    elements.
35440b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
35450b57cec5SDimitry Andric _mm256_maskload_epi64(long long const *__X, __m256i __M)
35460b57cec5SDimitry Andric {
35470b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
35480b57cec5SDimitry Andric }
35490b57cec5SDimitry Andric 
355006c3fb27SDimitry Andric /// Conditionally loads four 32-bit integer elements from memory \a __X, if
355106c3fb27SDimitry Andric ///    the most significant bit of the corresponding element in the mask
355206c3fb27SDimitry Andric ///    \a __M is set; otherwise, sets that element of the result to zero.
355306c3fb27SDimitry Andric ///    Returns the 128-bit [4 x i32] result.
355406c3fb27SDimitry Andric ///
355506c3fb27SDimitry Andric /// \code{.operation}
355606c3fb27SDimitry Andric /// FOR i := 0 TO 3
355706c3fb27SDimitry Andric ///   j := i*32
355806c3fb27SDimitry Andric ///   IF __M[j+31] == 1
355906c3fb27SDimitry Andric ///     result[j+31:j] := Load32(__X+(i*4))
356006c3fb27SDimitry Andric ///   ELSE
356106c3fb27SDimitry Andric ///     result[j+31:j] := 0
356206c3fb27SDimitry Andric ///   FI
356306c3fb27SDimitry Andric /// ENDFOR
356406c3fb27SDimitry Andric /// \endcode
356506c3fb27SDimitry Andric ///
356606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
356706c3fb27SDimitry Andric ///
356806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
356906c3fb27SDimitry Andric ///
357006c3fb27SDimitry Andric /// \param __X
357106c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
357206c3fb27SDimitry Andric /// \param __M
357306c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the mask bits.
357406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
357506c3fb27SDimitry Andric ///    elements.
35760b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
35770b57cec5SDimitry Andric _mm_maskload_epi32(int const *__X, __m128i __M)
35780b57cec5SDimitry Andric {
35790b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
35800b57cec5SDimitry Andric }
35810b57cec5SDimitry Andric 
358206c3fb27SDimitry Andric /// Conditionally loads two 64-bit integer elements from memory \a __X, if
358306c3fb27SDimitry Andric ///    the most significant bit of the corresponding element in the mask
358406c3fb27SDimitry Andric ///    \a __M is set; otherwise, sets that element of the result to zero.
358506c3fb27SDimitry Andric ///    Returns the 128-bit [2 x i64] result.
358606c3fb27SDimitry Andric ///
358706c3fb27SDimitry Andric /// \code{.operation}
358806c3fb27SDimitry Andric /// FOR i := 0 TO 1
358906c3fb27SDimitry Andric ///   j := i*64
359006c3fb27SDimitry Andric ///   IF __M[j+63] == 1
359106c3fb27SDimitry Andric ///     result[j+63:j] := Load64(__X+(i*8))
359206c3fb27SDimitry Andric ///   ELSE
359306c3fb27SDimitry Andric ///     result[j+63:j] := 0
359406c3fb27SDimitry Andric ///   FI
359506c3fb27SDimitry Andric /// ENDFOR
359606c3fb27SDimitry Andric /// \endcode
359706c3fb27SDimitry Andric ///
359806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
359906c3fb27SDimitry Andric ///
360006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
360106c3fb27SDimitry Andric ///
360206c3fb27SDimitry Andric /// \param __X
360306c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
360406c3fb27SDimitry Andric /// \param __M
360506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing the mask bits.
360606c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
360706c3fb27SDimitry Andric ///    elements.
36080b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
36090b57cec5SDimitry Andric _mm_maskload_epi64(long long const *__X, __m128i __M)
36100b57cec5SDimitry Andric {
36110b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
36120b57cec5SDimitry Andric }
36130b57cec5SDimitry Andric 
361406c3fb27SDimitry Andric /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
361506c3fb27SDimitry Andric ///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
361606c3fb27SDimitry Andric ///    the corresponding element in the mask \a __M is set; otherwise, the
361706c3fb27SDimitry Andric ///    memory element is unchanged.
361806c3fb27SDimitry Andric ///
361906c3fb27SDimitry Andric /// \code{.operation}
362006c3fb27SDimitry Andric /// FOR i := 0 TO 7
362106c3fb27SDimitry Andric ///   j := i*32
362206c3fb27SDimitry Andric ///   IF __M[j+31] == 1
362306c3fb27SDimitry Andric ///     Store32(__X+(i*4), __Y[j+31:j])
362406c3fb27SDimitry Andric ///   FI
362506c3fb27SDimitry Andric /// ENDFOR
362606c3fb27SDimitry Andric /// \endcode
362706c3fb27SDimitry Andric ///
362806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
362906c3fb27SDimitry Andric ///
363006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
363106c3fb27SDimitry Andric ///
363206c3fb27SDimitry Andric /// \param __X
363306c3fb27SDimitry Andric ///    A pointer to the memory used for storing values.
363406c3fb27SDimitry Andric /// \param __M
363506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the mask bits.
363606c3fb27SDimitry Andric /// \param __Y
363706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the values to store.
36380b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS256
36390b57cec5SDimitry Andric _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
36400b57cec5SDimitry Andric {
36410b57cec5SDimitry Andric   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
36420b57cec5SDimitry Andric }
36430b57cec5SDimitry Andric 
364406c3fb27SDimitry Andric /// Conditionally stores four 64-bit integer elements from the 256-bit vector
364506c3fb27SDimitry Andric ///    of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
364606c3fb27SDimitry Andric ///    the corresponding element in the mask \a __M is set; otherwise, the
364706c3fb27SDimitry Andric ///    memory element is unchanged.
364806c3fb27SDimitry Andric ///
364906c3fb27SDimitry Andric /// \code{.operation}
365006c3fb27SDimitry Andric /// FOR i := 0 TO 3
365106c3fb27SDimitry Andric ///   j := i*64
365206c3fb27SDimitry Andric ///   IF __M[j+63] == 1
365306c3fb27SDimitry Andric ///     Store64(__X+(i*8), __Y[j+63:j])
365406c3fb27SDimitry Andric ///   FI
365506c3fb27SDimitry Andric /// ENDFOR
365606c3fb27SDimitry Andric /// \endcode
365706c3fb27SDimitry Andric ///
365806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
365906c3fb27SDimitry Andric ///
366006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
366106c3fb27SDimitry Andric ///
366206c3fb27SDimitry Andric /// \param __X
366306c3fb27SDimitry Andric ///    A pointer to the memory used for storing values.
366406c3fb27SDimitry Andric /// \param __M
366506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the mask bits.
366606c3fb27SDimitry Andric /// \param __Y
366706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the values to store.
36680b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS256
36690b57cec5SDimitry Andric _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
36700b57cec5SDimitry Andric {
36710b57cec5SDimitry Andric   __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
36720b57cec5SDimitry Andric }
36730b57cec5SDimitry Andric 
367406c3fb27SDimitry Andric /// Conditionally stores four 32-bit integer elements from the 128-bit vector
367506c3fb27SDimitry Andric ///    of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
367606c3fb27SDimitry Andric ///    the corresponding element in the mask \a __M is set; otherwise, the
367706c3fb27SDimitry Andric ///    memory element is unchanged.
367806c3fb27SDimitry Andric ///
367906c3fb27SDimitry Andric /// \code{.operation}
368006c3fb27SDimitry Andric /// FOR i := 0 TO 3
368106c3fb27SDimitry Andric ///   j := i*32
368206c3fb27SDimitry Andric ///   IF __M[j+31] == 1
368306c3fb27SDimitry Andric ///     Store32(__X+(i*4), __Y[j+31:j])
368406c3fb27SDimitry Andric ///   FI
368506c3fb27SDimitry Andric /// ENDFOR
368606c3fb27SDimitry Andric /// \endcode
368706c3fb27SDimitry Andric ///
368806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
368906c3fb27SDimitry Andric ///
369006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
369106c3fb27SDimitry Andric ///
369206c3fb27SDimitry Andric /// \param __X
369306c3fb27SDimitry Andric ///    A pointer to the memory used for storing values.
369406c3fb27SDimitry Andric /// \param __M
369506c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the mask bits.
369606c3fb27SDimitry Andric /// \param __Y
369706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the values to store.
36980b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS128
36990b57cec5SDimitry Andric _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
37000b57cec5SDimitry Andric {
37010b57cec5SDimitry Andric   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
37020b57cec5SDimitry Andric }
37030b57cec5SDimitry Andric 
370406c3fb27SDimitry Andric /// Conditionally stores two 64-bit integer elements from the 128-bit vector
370506c3fb27SDimitry Andric ///    of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
370606c3fb27SDimitry Andric ///    the corresponding element in the mask \a __M is set; otherwise, the
370706c3fb27SDimitry Andric ///    memory element is unchanged.
370806c3fb27SDimitry Andric ///
370906c3fb27SDimitry Andric /// \code{.operation}
371006c3fb27SDimitry Andric /// FOR i := 0 TO 1
371106c3fb27SDimitry Andric ///   j := i*64
371206c3fb27SDimitry Andric ///   IF __M[j+63] == 1
371306c3fb27SDimitry Andric ///     Store64(__X+(i*8), __Y[j+63:j])
371406c3fb27SDimitry Andric ///   FI
371506c3fb27SDimitry Andric /// ENDFOR
371606c3fb27SDimitry Andric /// \endcode
371706c3fb27SDimitry Andric ///
371806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
371906c3fb27SDimitry Andric ///
372006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
372106c3fb27SDimitry Andric ///
372206c3fb27SDimitry Andric /// \param __X
372306c3fb27SDimitry Andric ///    A pointer to the memory used for storing values.
372406c3fb27SDimitry Andric /// \param __M
372506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing the mask bits.
372606c3fb27SDimitry Andric /// \param __Y
372706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing the values to store.
37280b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS128
37290b57cec5SDimitry Andric _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
37300b57cec5SDimitry Andric {
37310b57cec5SDimitry Andric   __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
37320b57cec5SDimitry Andric }
37330b57cec5SDimitry Andric 
373406c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
373506c3fb27SDimitry Andric ///    left by the number of bits given in the corresponding element of the
373606c3fb27SDimitry Andric ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
373706c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
373806c3fb27SDimitry Andric ///    31, the result for that element is zero.
373906c3fb27SDimitry Andric ///
374006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
374106c3fb27SDimitry Andric ///
374206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVD instruction.
374306c3fb27SDimitry Andric ///
374406c3fb27SDimitry Andric /// \param __X
374506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
374606c3fb27SDimitry Andric /// \param __Y
374706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
374806c3fb27SDimitry Andric ///    bits).
374906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
37500b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
37510b57cec5SDimitry Andric _mm256_sllv_epi32(__m256i __X, __m256i __Y)
37520b57cec5SDimitry Andric {
37530b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
37540b57cec5SDimitry Andric }
37550b57cec5SDimitry Andric 
375606c3fb27SDimitry Andric /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
375706c3fb27SDimitry Andric ///    left by the number of bits given in the corresponding element of the
375806c3fb27SDimitry Andric ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
375906c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
376006c3fb27SDimitry Andric ///    31, the result for that element is zero.
376106c3fb27SDimitry Andric ///
376206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
376306c3fb27SDimitry Andric ///
376406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVD instruction.
376506c3fb27SDimitry Andric ///
376606c3fb27SDimitry Andric /// \param __X
376706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] to be shifted.
376806c3fb27SDimitry Andric /// \param __Y
376906c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
377006c3fb27SDimitry Andric ///    bits).
377106c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result.
37720b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
37730b57cec5SDimitry Andric _mm_sllv_epi32(__m128i __X, __m128i __Y)
37740b57cec5SDimitry Andric {
37750b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
37760b57cec5SDimitry Andric }
37770b57cec5SDimitry Andric 
377806c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
377906c3fb27SDimitry Andric ///    left by the number of bits given in the corresponding element of the
378006c3fb27SDimitry Andric ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
378106c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
378206c3fb27SDimitry Andric ///    63, the result for that element is zero.
378306c3fb27SDimitry Andric ///
378406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
378506c3fb27SDimitry Andric ///
378606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVQ instruction.
378706c3fb27SDimitry Andric ///
378806c3fb27SDimitry Andric /// \param __X
378906c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] to be shifted.
379006c3fb27SDimitry Andric /// \param __Y
379106c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
379206c3fb27SDimitry Andric ///    bits).
379306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
37940b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
37950b57cec5SDimitry Andric _mm256_sllv_epi64(__m256i __X, __m256i __Y)
37960b57cec5SDimitry Andric {
37970b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
37980b57cec5SDimitry Andric }
37990b57cec5SDimitry Andric 
380006c3fb27SDimitry Andric /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
380106c3fb27SDimitry Andric ///    left by the number of bits given in the corresponding element of the
380206c3fb27SDimitry Andric ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
380306c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
380406c3fb27SDimitry Andric ///    63, the result for that element is zero.
380506c3fb27SDimitry Andric ///
380606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
380706c3fb27SDimitry Andric ///
380806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVQ instruction.
380906c3fb27SDimitry Andric ///
381006c3fb27SDimitry Andric /// \param __X
381106c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] to be shifted.
381206c3fb27SDimitry Andric /// \param __Y
381306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
381406c3fb27SDimitry Andric ///    bits).
381506c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the result.
38160b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
38170b57cec5SDimitry Andric _mm_sllv_epi64(__m128i __X, __m128i __Y)
38180b57cec5SDimitry Andric {
38190b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
38200b57cec5SDimitry Andric }
38210b57cec5SDimitry Andric 
382206c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
382306c3fb27SDimitry Andric ///    right by the number of bits given in the corresponding element of the
382406c3fb27SDimitry Andric ///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
382506c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
382606c3fb27SDimitry Andric ///    31, the result for that element is 0 or -1 according to the sign bit
382706c3fb27SDimitry Andric ///    for that element.
382806c3fb27SDimitry Andric ///
382906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
383006c3fb27SDimitry Andric ///
383106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAVD instruction.
383206c3fb27SDimitry Andric ///
383306c3fb27SDimitry Andric /// \param __X
383406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
383506c3fb27SDimitry Andric /// \param __Y
383606c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
383706c3fb27SDimitry Andric ///    bits).
383806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
38390b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
38400b57cec5SDimitry Andric _mm256_srav_epi32(__m256i __X, __m256i __Y)
38410b57cec5SDimitry Andric {
38420b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
38430b57cec5SDimitry Andric }
38440b57cec5SDimitry Andric 
384506c3fb27SDimitry Andric /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
384606c3fb27SDimitry Andric ///    right by the number of bits given in the corresponding element of the
384706c3fb27SDimitry Andric ///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
384806c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
384906c3fb27SDimitry Andric ///    31, the result for that element is 0 or -1 according to the sign bit
385006c3fb27SDimitry Andric ///    for that element.
385106c3fb27SDimitry Andric ///
385206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
385306c3fb27SDimitry Andric ///
385406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAVD instruction.
385506c3fb27SDimitry Andric ///
385606c3fb27SDimitry Andric /// \param __X
385706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] to be shifted.
385806c3fb27SDimitry Andric /// \param __Y
385906c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
386006c3fb27SDimitry Andric ///    bits).
386106c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result.
38620b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
38630b57cec5SDimitry Andric _mm_srav_epi32(__m128i __X, __m128i __Y)
38640b57cec5SDimitry Andric {
38650b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
38660b57cec5SDimitry Andric }
38670b57cec5SDimitry Andric 
386806c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
386906c3fb27SDimitry Andric ///    right by the number of bits given in the corresponding element of the
387006c3fb27SDimitry Andric ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
387106c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
387206c3fb27SDimitry Andric ///    31, the result for that element is zero.
387306c3fb27SDimitry Andric ///
387406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
387506c3fb27SDimitry Andric ///
387606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVD instruction.
387706c3fb27SDimitry Andric ///
387806c3fb27SDimitry Andric /// \param __X
387906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] to be shifted.
388006c3fb27SDimitry Andric /// \param __Y
388106c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
388206c3fb27SDimitry Andric ///    bits).
388306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result.
38840b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
38850b57cec5SDimitry Andric _mm256_srlv_epi32(__m256i __X, __m256i __Y)
38860b57cec5SDimitry Andric {
38870b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
38880b57cec5SDimitry Andric }
38890b57cec5SDimitry Andric 
389006c3fb27SDimitry Andric /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
389106c3fb27SDimitry Andric ///    right by the number of bits given in the corresponding element of the
389206c3fb27SDimitry Andric ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
389306c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
389406c3fb27SDimitry Andric ///    31, the result for that element is zero.
389506c3fb27SDimitry Andric ///
389606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
389706c3fb27SDimitry Andric ///
389806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVD instruction.
389906c3fb27SDimitry Andric ///
390006c3fb27SDimitry Andric /// \param __X
390106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] to be shifted.
390206c3fb27SDimitry Andric /// \param __Y
390306c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
390406c3fb27SDimitry Andric ///    bits).
390506c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result.
39060b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
39070b57cec5SDimitry Andric _mm_srlv_epi32(__m128i __X, __m128i __Y)
39080b57cec5SDimitry Andric {
39090b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
39100b57cec5SDimitry Andric }
39110b57cec5SDimitry Andric 
391206c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
391306c3fb27SDimitry Andric ///    right by the number of bits given in the corresponding element of the
391406c3fb27SDimitry Andric ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
391506c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
391606c3fb27SDimitry Andric ///    63, the result for that element is zero.
391706c3fb27SDimitry Andric ///
391806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
391906c3fb27SDimitry Andric ///
392006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVQ instruction.
392106c3fb27SDimitry Andric ///
392206c3fb27SDimitry Andric /// \param __X
392306c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] to be shifted.
392406c3fb27SDimitry Andric /// \param __Y
392506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
392606c3fb27SDimitry Andric ///    bits).
392706c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result.
39280b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
39290b57cec5SDimitry Andric _mm256_srlv_epi64(__m256i __X, __m256i __Y)
39300b57cec5SDimitry Andric {
39310b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
39320b57cec5SDimitry Andric }
39330b57cec5SDimitry Andric 
393406c3fb27SDimitry Andric /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
393506c3fb27SDimitry Andric ///    right by the number of bits given in the corresponding element of the
393606c3fb27SDimitry Andric ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
393706c3fb27SDimitry Andric ///    returns the result. If the shift count for any element is greater than
393806c3fb27SDimitry Andric ///    63, the result for that element is zero.
393906c3fb27SDimitry Andric ///
394006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
394106c3fb27SDimitry Andric ///
394206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVQ instruction.
394306c3fb27SDimitry Andric ///
394406c3fb27SDimitry Andric /// \param __X
394506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] to be shifted.
394606c3fb27SDimitry Andric /// \param __Y
394706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
394806c3fb27SDimitry Andric ///    bits).
394906c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the result.
39500b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
39510b57cec5SDimitry Andric _mm_srlv_epi64(__m128i __X, __m128i __Y)
39520b57cec5SDimitry Andric {
39530b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
39540b57cec5SDimitry Andric }
39550b57cec5SDimitry Andric 
395606c3fb27SDimitry Andric /// Conditionally gathers two 64-bit floating-point values, either from the
395706c3fb27SDimitry Andric ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
395806c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
395906c3fb27SDimitry Andric ///    of [2 x double] in \a mask determines the source for each element.
396006c3fb27SDimitry Andric ///
396106c3fb27SDimitry Andric /// \code{.operation}
396206c3fb27SDimitry Andric /// FOR element := 0 to 1
396306c3fb27SDimitry Andric ///   j := element*64
396406c3fb27SDimitry Andric ///   k := element*32
396506c3fb27SDimitry Andric ///   IF mask[j+63] == 0
396606c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
396706c3fb27SDimitry Andric ///   ELSE
396806c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
396906c3fb27SDimitry Andric ///   FI
397006c3fb27SDimitry Andric /// ENDFOR
397106c3fb27SDimitry Andric /// \endcode
397206c3fb27SDimitry Andric ///
397306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
397406c3fb27SDimitry Andric ///
397506c3fb27SDimitry Andric /// \code
397606c3fb27SDimitry Andric /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
397706c3fb27SDimitry Andric ///                               __m128d mask, const int s);
397806c3fb27SDimitry Andric /// \endcode
397906c3fb27SDimitry Andric ///
398006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPD instruction.
398106c3fb27SDimitry Andric ///
398206c3fb27SDimitry Andric /// \param a
398306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
398406c3fb27SDimitry Andric ///    zero.
398506c3fb27SDimitry Andric /// \param m
398606c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
398706c3fb27SDimitry Andric /// \param i
398806c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
398906c3fb27SDimitry Andric ///    the first two elements are used.
399006c3fb27SDimitry Andric /// \param mask
399106c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the mask. The most
399206c3fb27SDimitry Andric ///    significant bit of each element in the mask vector represents the mask
399306c3fb27SDimitry Andric ///    bits. If a mask bit is zero, the corresponding value from vector \a a
399406c3fb27SDimitry Andric ///    is gathered; otherwise the value is loaded from memory.
399506c3fb27SDimitry Andric /// \param s
399606c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
399706c3fb27SDimitry Andric ///    1, 2, 4, or 8.
399806c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Implemented as a macro so that the scale argument s reaches the builtin
 * unchanged. The pass-through operand a and the mask are both converted via
 * __m128d before the cast to __v2df, and the index operand i via __m128i
 * before the cast to __v4si.
 */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
40040b57cec5SDimitry Andric 
400506c3fb27SDimitry Andric /// Conditionally gathers four 64-bit floating-point values, either from the
400606c3fb27SDimitry Andric ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
400706c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
400806c3fb27SDimitry Andric ///    of [4 x double] in \a mask determines the source for each element.
400906c3fb27SDimitry Andric ///
401006c3fb27SDimitry Andric /// \code{.operation}
401106c3fb27SDimitry Andric /// FOR element := 0 to 3
401206c3fb27SDimitry Andric ///   j := element*64
401306c3fb27SDimitry Andric ///   k := element*32
401406c3fb27SDimitry Andric ///   IF mask[j+63] == 0
401506c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
401606c3fb27SDimitry Andric ///   ELSE
401706c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
401806c3fb27SDimitry Andric ///   FI
401906c3fb27SDimitry Andric /// ENDFOR
402006c3fb27SDimitry Andric /// \endcode
402106c3fb27SDimitry Andric ///
402206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
402306c3fb27SDimitry Andric ///
402406c3fb27SDimitry Andric /// \code
402506c3fb27SDimitry Andric /// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
402606c3fb27SDimitry Andric ///                                  __m256d mask, const int s);
402706c3fb27SDimitry Andric /// \endcode
402806c3fb27SDimitry Andric ///
402906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPD instruction.
403006c3fb27SDimitry Andric ///
403106c3fb27SDimitry Andric /// \param a
403206c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
403306c3fb27SDimitry Andric ///    zero.
403406c3fb27SDimitry Andric /// \param m
403506c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
403606c3fb27SDimitry Andric /// \param i
403706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
403806c3fb27SDimitry Andric /// \param mask
403906c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the mask. The most
404006c3fb27SDimitry Andric ///    significant bit of each element is the mask bit for that element. If a
404106c3fb27SDimitry Andric ///    mask bit is zero, the corresponding element of vector \a a is used;
404206c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
404306c3fb27SDimitry Andric /// \param s
404406c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
404506c3fb27SDimitry Andric ///    1, 2, 4, or 8.
404606c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the gathered values.
40470b57cec5SDimitry Andric #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
4048349cc55cSDimitry Andric   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
40490b57cec5SDimitry Andric                                          (double const *)(m), \
40500b57cec5SDimitry Andric                                          (__v4si)(__m128i)(i), \
4051349cc55cSDimitry Andric                                          (__v4df)(__m256d)(mask), (s)))
40520b57cec5SDimitry Andric 
405306c3fb27SDimitry Andric /// Conditionally gathers two 64-bit floating-point values, either from the
405406c3fb27SDimitry Andric ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
405506c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
405606c3fb27SDimitry Andric ///    of [2 x double] in \a mask determines the source for each element.
405706c3fb27SDimitry Andric ///
405806c3fb27SDimitry Andric /// \code{.operation}
405906c3fb27SDimitry Andric /// FOR element := 0 to 1
406006c3fb27SDimitry Andric ///   j := element*64
406106c3fb27SDimitry Andric ///   k := element*64
406206c3fb27SDimitry Andric ///   IF mask[j+63] == 0
406306c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
406406c3fb27SDimitry Andric ///   ELSE
406506c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
406606c3fb27SDimitry Andric ///   FI
406706c3fb27SDimitry Andric /// ENDFOR
406806c3fb27SDimitry Andric /// \endcode
406906c3fb27SDimitry Andric ///
407006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
407106c3fb27SDimitry Andric ///
407206c3fb27SDimitry Andric /// \code
407306c3fb27SDimitry Andric /// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
407406c3fb27SDimitry Andric ///                               __m128d mask, const int s);
407506c3fb27SDimitry Andric /// \endcode
407606c3fb27SDimitry Andric ///
407706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPD instruction.
407806c3fb27SDimitry Andric ///
407906c3fb27SDimitry Andric /// \param a
408006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] used as the source when a mask bit is
408106c3fb27SDimitry Andric ///    zero.
408206c3fb27SDimitry Andric /// \param m
408306c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
408406c3fb27SDimitry Andric /// \param i
408506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
408606c3fb27SDimitry Andric /// \param mask
408706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the mask. The most
408806c3fb27SDimitry Andric ///    significant bit of each element is the mask bit for that element. If a
408906c3fb27SDimitry Andric ///    mask bit is zero, the corresponding element of vector \a a is used;
409006c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
409106c3fb27SDimitry Andric /// \param s
409206c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
409306c3fb27SDimitry Andric ///    1, 2, 4, or 8.
409406c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the gathered values.
40950b57cec5SDimitry Andric #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
4096349cc55cSDimitry Andric   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
40970b57cec5SDimitry Andric                                       (double const *)(m), \
40980b57cec5SDimitry Andric                                       (__v2di)(__m128i)(i), \
4099349cc55cSDimitry Andric                                       (__v2df)(__m128d)(mask), (s)))
41000b57cec5SDimitry Andric 
410106c3fb27SDimitry Andric /// Conditionally gathers four 64-bit floating-point values, either from the
410206c3fb27SDimitry Andric ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
410306c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
410406c3fb27SDimitry Andric ///    of [4 x double] in \a mask determines the source for each element.
410506c3fb27SDimitry Andric ///
410606c3fb27SDimitry Andric /// \code{.operation}
410706c3fb27SDimitry Andric /// FOR element := 0 to 3
410806c3fb27SDimitry Andric ///   j := element*64
410906c3fb27SDimitry Andric ///   k := element*64
411006c3fb27SDimitry Andric ///   IF mask[j+63] == 0
411106c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
411206c3fb27SDimitry Andric ///   ELSE
411306c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
411406c3fb27SDimitry Andric ///   FI
411506c3fb27SDimitry Andric /// ENDFOR
411606c3fb27SDimitry Andric /// \endcode
411706c3fb27SDimitry Andric ///
411806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
411906c3fb27SDimitry Andric ///
412006c3fb27SDimitry Andric /// \code
412106c3fb27SDimitry Andric /// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
412206c3fb27SDimitry Andric ///                                  __m256d mask, const int s);
412306c3fb27SDimitry Andric /// \endcode
412406c3fb27SDimitry Andric ///
412506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPD instruction.
412606c3fb27SDimitry Andric ///
412706c3fb27SDimitry Andric /// \param a
412806c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] used as the source when a mask bit is
412906c3fb27SDimitry Andric ///    zero.
413006c3fb27SDimitry Andric /// \param m
413106c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
413206c3fb27SDimitry Andric /// \param i
413306c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
413406c3fb27SDimitry Andric /// \param mask
413506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the mask. The most
413606c3fb27SDimitry Andric ///    significant bit of each element is the mask bit for that element. If a
413706c3fb27SDimitry Andric ///    mask bit is zero, the corresponding element of vector \a a is used;
413806c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
413906c3fb27SDimitry Andric /// \param s
414006c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
414106c3fb27SDimitry Andric ///    1, 2, 4, or 8.
414206c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the gathered values.
41430b57cec5SDimitry Andric #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
4144349cc55cSDimitry Andric   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
41450b57cec5SDimitry Andric                                          (double const *)(m), \
41460b57cec5SDimitry Andric                                          (__v4di)(__m256i)(i), \
4147349cc55cSDimitry Andric                                          (__v4df)(__m256d)(mask), (s)))
41480b57cec5SDimitry Andric 
414906c3fb27SDimitry Andric /// Conditionally gathers four 32-bit floating-point values, either from the
415006c3fb27SDimitry Andric ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
415106c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
415206c3fb27SDimitry Andric ///    of [4 x float] in \a mask determines the source for each element.
415306c3fb27SDimitry Andric ///
415406c3fb27SDimitry Andric /// \code{.operation}
415506c3fb27SDimitry Andric /// FOR element := 0 to 3
415606c3fb27SDimitry Andric ///   j := element*32
415706c3fb27SDimitry Andric ///   k := element*32
415806c3fb27SDimitry Andric ///   IF mask[j+31] == 0
415906c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
416006c3fb27SDimitry Andric ///   ELSE
416106c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
416206c3fb27SDimitry Andric ///   FI
416306c3fb27SDimitry Andric /// ENDFOR
416406c3fb27SDimitry Andric /// \endcode
416506c3fb27SDimitry Andric ///
416606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
416706c3fb27SDimitry Andric ///
416806c3fb27SDimitry Andric /// \code
416906c3fb27SDimitry Andric /// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
417006c3fb27SDimitry Andric ///                              __m128 mask, const int s);
417106c3fb27SDimitry Andric /// \endcode
417206c3fb27SDimitry Andric ///
417306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPS instruction.
417406c3fb27SDimitry Andric ///
417506c3fb27SDimitry Andric /// \param a
417606c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
417706c3fb27SDimitry Andric ///    zero.
417806c3fb27SDimitry Andric /// \param m
417906c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
418006c3fb27SDimitry Andric /// \param i
418106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
418206c3fb27SDimitry Andric /// \param mask
418306c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the mask. The most
418406c3fb27SDimitry Andric ///    significant bit of each element is the mask bit for that element. If a
418506c3fb27SDimitry Andric ///    mask bit is zero, the corresponding element of vector \a a is used;
418606c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
418706c3fb27SDimitry Andric /// \param s
418806c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
418906c3fb27SDimitry Andric ///    1, 2, 4, or 8.
419006c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values.
41910b57cec5SDimitry Andric #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
4192349cc55cSDimitry Andric   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
41930b57cec5SDimitry Andric                                      (float const *)(m), \
41940b57cec5SDimitry Andric                                      (__v4si)(__m128i)(i), \
4195349cc55cSDimitry Andric                                      (__v4sf)(__m128)(mask), (s)))
41960b57cec5SDimitry Andric 
419706c3fb27SDimitry Andric /// Conditionally gathers eight 32-bit floating-point values, either from the
419806c3fb27SDimitry Andric ///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
419906c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
420006c3fb27SDimitry Andric ///    of [8 x float] in \a mask determines the source for each element.
420106c3fb27SDimitry Andric ///
420206c3fb27SDimitry Andric /// \code{.operation}
420306c3fb27SDimitry Andric /// FOR element := 0 to 7
420406c3fb27SDimitry Andric ///   j := element*32
420506c3fb27SDimitry Andric ///   k := element*32
420606c3fb27SDimitry Andric ///   IF mask[j+31] == 0
420706c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
420806c3fb27SDimitry Andric ///   ELSE
420906c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
421006c3fb27SDimitry Andric ///   FI
421106c3fb27SDimitry Andric /// ENDFOR
421206c3fb27SDimitry Andric /// \endcode
421306c3fb27SDimitry Andric ///
421406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
421506c3fb27SDimitry Andric ///
421606c3fb27SDimitry Andric /// \code
421706c3fb27SDimitry Andric /// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
421806c3fb27SDimitry Andric ///                                 __m256 mask, const int s);
421906c3fb27SDimitry Andric /// \endcode
422006c3fb27SDimitry Andric ///
422106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPS instruction.
422206c3fb27SDimitry Andric ///
422306c3fb27SDimitry Andric /// \param a
422406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] used as the source when a mask bit is
422506c3fb27SDimitry Andric ///    zero.
422606c3fb27SDimitry Andric /// \param m
422706c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
422806c3fb27SDimitry Andric /// \param i
422906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
423006c3fb27SDimitry Andric /// \param mask
423106c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the mask. The most
423206c3fb27SDimitry Andric ///    significant bit of each element is the mask bit for that element. If a
423306c3fb27SDimitry Andric ///    mask bit is zero, the corresponding element of vector \a a is used;
423406c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
423506c3fb27SDimitry Andric /// \param s
423606c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
423706c3fb27SDimitry Andric ///    1, 2, 4, or 8.
423806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the gathered values.
42390b57cec5SDimitry Andric #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
4240349cc55cSDimitry Andric   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
42410b57cec5SDimitry Andric                                         (float const *)(m), \
42420b57cec5SDimitry Andric                                         (__v8si)(__m256i)(i), \
4243349cc55cSDimitry Andric                                         (__v8sf)(__m256)(mask), (s)))
42440b57cec5SDimitry Andric 
424506c3fb27SDimitry Andric /// Conditionally gathers two 32-bit floating-point values, either from the
424606c3fb27SDimitry Andric ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
424706c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
424806c3fb27SDimitry Andric ///    of [4 x float] in \a mask determines the source for the lower two
424906c3fb27SDimitry Andric ///    elements. The upper two elements of the result are zeroed.
425006c3fb27SDimitry Andric ///
425106c3fb27SDimitry Andric /// \code{.operation}
425206c3fb27SDimitry Andric /// FOR element := 0 to 1
425306c3fb27SDimitry Andric ///   j := element*32
425406c3fb27SDimitry Andric ///   k := element*64
425506c3fb27SDimitry Andric ///   IF mask[j+31] == 0
425606c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
425706c3fb27SDimitry Andric ///   ELSE
425806c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
425906c3fb27SDimitry Andric ///   FI
426006c3fb27SDimitry Andric /// ENDFOR
426106c3fb27SDimitry Andric /// result[127:64] := 0
426206c3fb27SDimitry Andric /// \endcode
426306c3fb27SDimitry Andric ///
426406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
426506c3fb27SDimitry Andric ///
426606c3fb27SDimitry Andric /// \code
426706c3fb27SDimitry Andric /// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
426806c3fb27SDimitry Andric ///                              __m128 mask, const int s);
426906c3fb27SDimitry Andric /// \endcode
427006c3fb27SDimitry Andric ///
427106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPS instruction.
427206c3fb27SDimitry Andric ///
427306c3fb27SDimitry Andric /// \param a
427406c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
427506c3fb27SDimitry Andric ///    zero. Only the first two elements are used.
427606c3fb27SDimitry Andric /// \param m
427706c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
427806c3fb27SDimitry Andric /// \param i
427906c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
428006c3fb27SDimitry Andric /// \param mask
428106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the mask. The most
428206c3fb27SDimitry Andric ///    significant bit of each element is the mask bit for that element. If a
428306c3fb27SDimitry Andric ///    mask bit is zero, the corresponding element of vector \a a is used;
428406c3fb27SDimitry Andric ///    otherwise the value is loaded from memory. Only the first two elements
428506c3fb27SDimitry Andric ///    of the mask are used.
428606c3fb27SDimitry Andric /// \param s
428706c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
428806c3fb27SDimitry Andric ///    1, 2, 4, or 8.
428906c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values.
42900b57cec5SDimitry Andric #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
4291349cc55cSDimitry Andric   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
42920b57cec5SDimitry Andric                                      (float const *)(m), \
42930b57cec5SDimitry Andric                                      (__v2di)(__m128i)(i), \
4294349cc55cSDimitry Andric                                      (__v4sf)(__m128)(mask), (s)))
42950b57cec5SDimitry Andric 
429606c3fb27SDimitry Andric /// Conditionally gathers four 32-bit floating-point values, either from the
429706c3fb27SDimitry Andric ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
429806c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
429906c3fb27SDimitry Andric ///    of [4 x float] in \a mask determines the source for each element.
430006c3fb27SDimitry Andric ///
430106c3fb27SDimitry Andric /// \code{.operation}
430206c3fb27SDimitry Andric /// FOR element := 0 to 3
430306c3fb27SDimitry Andric ///   j := element*32
430406c3fb27SDimitry Andric ///   k := element*64
430506c3fb27SDimitry Andric ///   IF mask[j+31] == 0
430606c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
430706c3fb27SDimitry Andric ///   ELSE
430806c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
430906c3fb27SDimitry Andric ///   FI
431006c3fb27SDimitry Andric /// ENDFOR
431106c3fb27SDimitry Andric /// \endcode
431206c3fb27SDimitry Andric ///
431306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
431406c3fb27SDimitry Andric ///
431506c3fb27SDimitry Andric /// \code
431606c3fb27SDimitry Andric /// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
431706c3fb27SDimitry Andric ///                                 __m128 mask, const int s);
431806c3fb27SDimitry Andric /// \endcode
431906c3fb27SDimitry Andric ///
432006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPS instruction.
432106c3fb27SDimitry Andric ///
432206c3fb27SDimitry Andric /// \param a
432306c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] used as the source when a mask bit is
432406c3fb27SDimitry Andric ///    zero.
432506c3fb27SDimitry Andric /// \param m
432606c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
432706c3fb27SDimitry Andric /// \param i
432806c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
432906c3fb27SDimitry Andric /// \param mask
433006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the mask. The most
433106c3fb27SDimitry Andric ///    significant bit of each element is the mask bit for that element. If a
433206c3fb27SDimitry Andric ///    mask bit is zero, the corresponding element of vector \a a is used;
433306c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
433406c3fb27SDimitry Andric /// \param s
433506c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
433606c3fb27SDimitry Andric ///    1, 2, 4, or 8.
433706c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values.
43380b57cec5SDimitry Andric #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
4339349cc55cSDimitry Andric   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
43400b57cec5SDimitry Andric                                         (float const *)(m), \
43410b57cec5SDimitry Andric                                         (__v4di)(__m256i)(i), \
4342349cc55cSDimitry Andric                                         (__v4sf)(__m128)(mask), (s)))
43430b57cec5SDimitry Andric 
434406c3fb27SDimitry Andric /// Conditionally gathers four 32-bit integer values, either from the
434506c3fb27SDimitry Andric ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
434606c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
434706c3fb27SDimitry Andric ///    of [4 x i32] in \a mask determines the source for each element.
434806c3fb27SDimitry Andric ///
434906c3fb27SDimitry Andric /// \code{.operation}
435006c3fb27SDimitry Andric /// FOR element := 0 to 3
435106c3fb27SDimitry Andric ///   j := element*32
435206c3fb27SDimitry Andric ///   k := element*32
435306c3fb27SDimitry Andric ///   IF mask[j+31] == 0
435406c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
435506c3fb27SDimitry Andric ///   ELSE
435606c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
435706c3fb27SDimitry Andric ///   FI
435806c3fb27SDimitry Andric /// ENDFOR
435906c3fb27SDimitry Andric /// \endcode
436006c3fb27SDimitry Andric ///
436106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
436206c3fb27SDimitry Andric ///
436306c3fb27SDimitry Andric /// \code
436406c3fb27SDimitry Andric /// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
436506c3fb27SDimitry Andric ///                                  __m128i mask, const int s);
436606c3fb27SDimitry Andric /// \endcode
436706c3fb27SDimitry Andric ///
436806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDD instruction.
436906c3fb27SDimitry Andric ///
437006c3fb27SDimitry Andric /// \param a
437106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
437206c3fb27SDimitry Andric ///    zero.
437306c3fb27SDimitry Andric /// \param m
437406c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
437506c3fb27SDimitry Andric /// \param i
437606c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
437706c3fb27SDimitry Andric /// \param mask
437806c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
437906c3fb27SDimitry Andric ///    bit of each element is the mask bit for that element. If a mask bit is
438006c3fb27SDimitry Andric ///    zero, the corresponding element of vector \a a is used; otherwise the
438106c3fb27SDimitry Andric ///    value is loaded from memory.
438206c3fb27SDimitry Andric /// \param s
438306c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
438406c3fb27SDimitry Andric ///    1, 2, 4, or 8.
438506c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
43860b57cec5SDimitry Andric #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
4387349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
43880b57cec5SDimitry Andric                                      (int const *)(m), \
43890b57cec5SDimitry Andric                                      (__v4si)(__m128i)(i), \
4390349cc55cSDimitry Andric                                      (__v4si)(__m128i)(mask), (s)))
43910b57cec5SDimitry Andric 
439206c3fb27SDimitry Andric /// Conditionally gathers eight 32-bit integer values, either from the
439306c3fb27SDimitry Andric ///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
439406c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
439506c3fb27SDimitry Andric ///    of [8 x i32] in \a mask determines the source for each element.
439606c3fb27SDimitry Andric ///
439706c3fb27SDimitry Andric /// \code{.operation}
439806c3fb27SDimitry Andric /// FOR element := 0 to 7
439906c3fb27SDimitry Andric ///   j := element*32
440006c3fb27SDimitry Andric ///   k := element*32
440106c3fb27SDimitry Andric ///   IF mask[j+31] == 0
440206c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
440306c3fb27SDimitry Andric ///   ELSE
440406c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
440506c3fb27SDimitry Andric ///   FI
440606c3fb27SDimitry Andric /// ENDFOR
440706c3fb27SDimitry Andric /// \endcode
440806c3fb27SDimitry Andric ///
440906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
441006c3fb27SDimitry Andric ///
441106c3fb27SDimitry Andric /// \code
441206c3fb27SDimitry Andric /// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
441306c3fb27SDimitry Andric ///                                     __m256i mask, const int s);
441406c3fb27SDimitry Andric /// \endcode
441506c3fb27SDimitry Andric ///
441606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDD instruction.
441706c3fb27SDimitry Andric ///
441806c3fb27SDimitry Andric /// \param a
441906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
442006c3fb27SDimitry Andric ///    zero.
442106c3fb27SDimitry Andric /// \param m
442206c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
442306c3fb27SDimitry Andric /// \param i
442406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
442506c3fb27SDimitry Andric /// \param mask
442606c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing the mask. The most significant
442706c3fb27SDimitry Andric ///    bit of each element is the mask bit for that element. If a mask bit is
442806c3fb27SDimitry Andric ///    zero, the corresponding element of vector \a a is used; otherwise the
442906c3fb27SDimitry Andric ///    value is loaded from memory.
443006c3fb27SDimitry Andric /// \param s
443106c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
443206c3fb27SDimitry Andric ///    1, 2, 4, or 8.
443306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
44340b57cec5SDimitry Andric #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
4435349cc55cSDimitry Andric   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
44360b57cec5SDimitry Andric                                         (int const *)(m), \
44370b57cec5SDimitry Andric                                         (__v8si)(__m256i)(i), \
4438349cc55cSDimitry Andric                                         (__v8si)(__m256i)(mask), (s)))
44390b57cec5SDimitry Andric 
444006c3fb27SDimitry Andric /// Conditionally gathers two 32-bit integer values, either from the
444106c3fb27SDimitry Andric ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
444206c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
444306c3fb27SDimitry Andric ///    of [4 x i32] in \a mask determines the source for the lower two
444406c3fb27SDimitry Andric ///    elements. The upper two elements of the result are zeroed.
444506c3fb27SDimitry Andric ///
444606c3fb27SDimitry Andric /// \code{.operation}
444706c3fb27SDimitry Andric /// FOR element := 0 to 1
444806c3fb27SDimitry Andric ///   j := element*32
444906c3fb27SDimitry Andric ///   k := element*64
445006c3fb27SDimitry Andric ///   IF mask[j+31] == 0
445106c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
445206c3fb27SDimitry Andric ///   ELSE
445306c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
445406c3fb27SDimitry Andric ///   FI
445506c3fb27SDimitry Andric /// ENDFOR
445606c3fb27SDimitry Andric /// result[127:64] := 0
445706c3fb27SDimitry Andric /// \endcode
445806c3fb27SDimitry Andric ///
445906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
446006c3fb27SDimitry Andric ///
446106c3fb27SDimitry Andric /// \code
446206c3fb27SDimitry Andric /// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
446306c3fb27SDimitry Andric ///                                  __m128i mask, const int s);
446406c3fb27SDimitry Andric /// \endcode
446506c3fb27SDimitry Andric ///
446606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQD instruction.
446706c3fb27SDimitry Andric ///
446806c3fb27SDimitry Andric /// \param a
446906c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
447006c3fb27SDimitry Andric ///    zero. Only the first two elements are used.
447106c3fb27SDimitry Andric /// \param m
447206c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
447306c3fb27SDimitry Andric /// \param i
447406c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
447506c3fb27SDimitry Andric /// \param mask
447606c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
447706c3fb27SDimitry Andric ///    bit of each element is the mask bit for that element. If a mask bit is
447806c3fb27SDimitry Andric ///    zero, the corresponding element of vector \a a is used; otherwise the
447906c3fb27SDimitry Andric ///    value is loaded from memory. Only the first two elements of the mask
448006c3fb27SDimitry Andric ///    are used.
448106c3fb27SDimitry Andric /// \param s
448206c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
448306c3fb27SDimitry Andric ///    1, 2, 4, or 8.
448406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
44850b57cec5SDimitry Andric #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
4486349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
44870b57cec5SDimitry Andric                                      (int const *)(m), \
44880b57cec5SDimitry Andric                                      (__v2di)(__m128i)(i), \
4489349cc55cSDimitry Andric                                      (__v4si)(__m128i)(mask), (s)))
44900b57cec5SDimitry Andric 
449106c3fb27SDimitry Andric /// Conditionally gathers four 32-bit integer values, either from the
449206c3fb27SDimitry Andric ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
449306c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
449406c3fb27SDimitry Andric ///    of [4 x i32] in \a mask determines the source for each element.
449506c3fb27SDimitry Andric ///
449606c3fb27SDimitry Andric /// \code{.operation}
449706c3fb27SDimitry Andric /// FOR element := 0 to 3
449806c3fb27SDimitry Andric ///   j := element*32
449906c3fb27SDimitry Andric ///   k := element*64
450006c3fb27SDimitry Andric ///   IF mask[j+31] == 0
450106c3fb27SDimitry Andric ///     result[j+31:j] := a[j+31:j]
450206c3fb27SDimitry Andric ///   ELSE
450306c3fb27SDimitry Andric ///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
450406c3fb27SDimitry Andric ///   FI
450506c3fb27SDimitry Andric /// ENDFOR
450606c3fb27SDimitry Andric /// \endcode
450706c3fb27SDimitry Andric ///
450806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
450906c3fb27SDimitry Andric ///
451006c3fb27SDimitry Andric /// \code
451106c3fb27SDimitry Andric /// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
451206c3fb27SDimitry Andric ///                                     __m128i mask, const int s);
451306c3fb27SDimitry Andric /// \endcode
451406c3fb27SDimitry Andric ///
451506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQD instruction.
451606c3fb27SDimitry Andric ///
451706c3fb27SDimitry Andric /// \param a
451806c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
451906c3fb27SDimitry Andric ///    zero.
452006c3fb27SDimitry Andric /// \param m
452106c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
452206c3fb27SDimitry Andric /// \param i
452306c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
452406c3fb27SDimitry Andric /// \param mask
452506c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing the mask. The most significant
452606c3fb27SDimitry Andric ///    bit of each element in the mask vector represents the mask bits. If a
452706c3fb27SDimitry Andric ///    mask bit is zero, the corresponding value from vector \a a is gathered;
452806c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
452906c3fb27SDimitry Andric /// \param s
453006c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
453106c3fb27SDimitry Andric ///    1, 2, 4, or 8.
453206c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Forwards a, m, i, mask and s, in that order, to the gatherq_d256 builtin.
 * Only the index vector is 256 bits wide ([4 x i64]); a, mask and the result
 * are 128-bit [4 x i32] vectors. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a), (const int *)(m), (__v4di)(__m256i)(i), \
      (__v4si)(__m128i)(mask), (s)))
45380b57cec5SDimitry Andric 
453906c3fb27SDimitry Andric /// Conditionally gathers two 64-bit integer values, either from the
454006c3fb27SDimitry Andric ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
454106c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
454206c3fb27SDimitry Andric ///    of [2 x i64] in \a mask determines the source for each element.
454306c3fb27SDimitry Andric ///
454406c3fb27SDimitry Andric /// \code{.operation}
454506c3fb27SDimitry Andric /// FOR element := 0 to 1
454606c3fb27SDimitry Andric ///   j := element*64
454706c3fb27SDimitry Andric ///   k := element*32
454806c3fb27SDimitry Andric ///   IF mask[j+63] == 0
454906c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
455006c3fb27SDimitry Andric ///   ELSE
455106c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
455206c3fb27SDimitry Andric ///   FI
455306c3fb27SDimitry Andric /// ENDFOR
455406c3fb27SDimitry Andric /// \endcode
455506c3fb27SDimitry Andric ///
455606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
455706c3fb27SDimitry Andric ///
455806c3fb27SDimitry Andric /// \code
455906c3fb27SDimitry Andric /// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
456006c3fb27SDimitry Andric ///                                  __m128i mask, const int s);
456106c3fb27SDimitry Andric /// \endcode
456206c3fb27SDimitry Andric ///
456306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
456406c3fb27SDimitry Andric ///
456506c3fb27SDimitry Andric /// \param a
456606c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
456706c3fb27SDimitry Andric ///    zero.
456806c3fb27SDimitry Andric /// \param m
456906c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
457006c3fb27SDimitry Andric /// \param i
457106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
457206c3fb27SDimitry Andric ///    the first two elements are used.
457306c3fb27SDimitry Andric /// \param mask
457406c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
457506c3fb27SDimitry Andric ///    bit of each element in the mask vector represents the mask bits. If a
457606c3fb27SDimitry Andric ///    mask bit is zero, the corresponding value from vector \a a is gathered;
457706c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
457806c3fb27SDimitry Andric /// \param s
457906c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
458006c3fb27SDimitry Andric ///    1, 2, 4, or 8.
458106c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Forwards a, m, i, mask and s, in that order, to the gatherd_q builtin.
 * a and mask are reinterpreted as [2 x i64], i as [4 x i32], and m is
 * converted to a pointer to const long long. */
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), (const long long *)(m), (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
45870b57cec5SDimitry Andric 
458806c3fb27SDimitry Andric /// Conditionally gathers four 64-bit integer values, either from the
458906c3fb27SDimitry Andric ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
459006c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
459106c3fb27SDimitry Andric ///    of [4 x i64] in \a mask determines the source for each element.
459206c3fb27SDimitry Andric ///
459306c3fb27SDimitry Andric /// \code{.operation}
459406c3fb27SDimitry Andric /// FOR element := 0 to 3
459506c3fb27SDimitry Andric ///   j := element*64
459606c3fb27SDimitry Andric ///   k := element*32
459706c3fb27SDimitry Andric ///   IF mask[j+63] == 0
459806c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
459906c3fb27SDimitry Andric ///   ELSE
460006c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
460106c3fb27SDimitry Andric ///   FI
460206c3fb27SDimitry Andric /// ENDFOR
460306c3fb27SDimitry Andric /// \endcode
460406c3fb27SDimitry Andric ///
460506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
460606c3fb27SDimitry Andric ///
460706c3fb27SDimitry Andric /// \code
460806c3fb27SDimitry Andric /// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
460906c3fb27SDimitry Andric ///                                     __m128i i, __m256i mask, const int s);
461006c3fb27SDimitry Andric /// \endcode
461106c3fb27SDimitry Andric ///
461206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
461306c3fb27SDimitry Andric ///
461406c3fb27SDimitry Andric /// \param a
461506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
461606c3fb27SDimitry Andric ///    zero.
461706c3fb27SDimitry Andric /// \param m
461806c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
461906c3fb27SDimitry Andric /// \param i
462006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
462106c3fb27SDimitry Andric /// \param mask
462206c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
462306c3fb27SDimitry Andric ///    bit of each element in the mask vector represents the mask bits. If a
462406c3fb27SDimitry Andric ///    mask bit is zero, the corresponding value from vector \a a is gathered;
462506c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
462606c3fb27SDimitry Andric /// \param s
462706c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
462806c3fb27SDimitry Andric ///    1, 2, 4, or 8.
462906c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Forwards a, m, i, mask and s, in that order, to the gatherd_q256 builtin.
 * a, mask and the result are 256-bit [4 x i64] vectors; the index vector is
 * a 128-bit [4 x i32]. */
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), (const long long *)(m), (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), (s)))
46350b57cec5SDimitry Andric 
463606c3fb27SDimitry Andric /// Conditionally gathers two 64-bit integer values, either from the
463706c3fb27SDimitry Andric ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
463806c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
463906c3fb27SDimitry Andric ///    of [2 x i64] in \a mask determines the source for each element.
464006c3fb27SDimitry Andric ///
464106c3fb27SDimitry Andric /// \code{.operation}
464206c3fb27SDimitry Andric /// FOR element := 0 to 1
464306c3fb27SDimitry Andric ///   j := element*64
464406c3fb27SDimitry Andric ///   k := element*64
464506c3fb27SDimitry Andric ///   IF mask[j+63] == 0
464606c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
464706c3fb27SDimitry Andric ///   ELSE
464806c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
464906c3fb27SDimitry Andric ///   FI
465006c3fb27SDimitry Andric /// ENDFOR
465106c3fb27SDimitry Andric /// \endcode
465206c3fb27SDimitry Andric ///
465306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
465406c3fb27SDimitry Andric ///
465506c3fb27SDimitry Andric /// \code
465606c3fb27SDimitry Andric /// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
465706c3fb27SDimitry Andric ///                                  __m128i mask, const int s);
465806c3fb27SDimitry Andric /// \endcode
465906c3fb27SDimitry Andric ///
466006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
466106c3fb27SDimitry Andric ///
466206c3fb27SDimitry Andric /// \param a
466306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
466406c3fb27SDimitry Andric ///    zero.
466506c3fb27SDimitry Andric /// \param m
466606c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
466706c3fb27SDimitry Andric /// \param i
466806c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
466906c3fb27SDimitry Andric /// \param mask
467006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing the mask. The most significant
467106c3fb27SDimitry Andric ///    bit of each element in the mask vector represents the mask bits. If a
467206c3fb27SDimitry Andric ///    mask bit is zero, the corresponding value from vector \a a is gathered;
467306c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
467406c3fb27SDimitry Andric /// \param s
467506c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
467606c3fb27SDimitry Andric ///    1, 2, 4, or 8.
467706c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Forwards a, m, i, mask and s, in that order, to the gatherq_q builtin.
 * a, i and mask are all reinterpreted as [2 x i64]; m is converted to a
 * pointer to const long long. */
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), (const long long *)(m), (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
46830b57cec5SDimitry Andric 
468406c3fb27SDimitry Andric /// Conditionally gathers four 64-bit integer values, either from the
468506c3fb27SDimitry Andric ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
468606c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
468706c3fb27SDimitry Andric ///    of [4 x i64] in \a mask determines the source for each element.
468806c3fb27SDimitry Andric ///
468906c3fb27SDimitry Andric /// \code{.operation}
469006c3fb27SDimitry Andric /// FOR element := 0 to 3
469106c3fb27SDimitry Andric ///   j := element*64
469206c3fb27SDimitry Andric ///   k := element*64
469306c3fb27SDimitry Andric ///   IF mask[j+63] == 0
469406c3fb27SDimitry Andric ///     result[j+63:j] := a[j+63:j]
469506c3fb27SDimitry Andric ///   ELSE
469606c3fb27SDimitry Andric ///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
469706c3fb27SDimitry Andric ///   FI
469806c3fb27SDimitry Andric /// ENDFOR
469906c3fb27SDimitry Andric /// \endcode
470006c3fb27SDimitry Andric ///
470106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
470206c3fb27SDimitry Andric ///
470306c3fb27SDimitry Andric /// \code
470406c3fb27SDimitry Andric /// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
470506c3fb27SDimitry Andric ///                                     __m256i i, __m256i mask, const int s);
470606c3fb27SDimitry Andric /// \endcode
470706c3fb27SDimitry Andric ///
470806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
470906c3fb27SDimitry Andric ///
471006c3fb27SDimitry Andric /// \param a
471106c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
471206c3fb27SDimitry Andric ///    zero.
471306c3fb27SDimitry Andric /// \param m
471406c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
471506c3fb27SDimitry Andric /// \param i
471606c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
471706c3fb27SDimitry Andric /// \param mask
471806c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing the mask. The most significant
471906c3fb27SDimitry Andric ///    bit of each element in the mask vector represents the mask bits. If a
472006c3fb27SDimitry Andric ///    mask bit is zero, the corresponding value from vector \a a is gathered;
472106c3fb27SDimitry Andric ///    otherwise the value is loaded from memory.
472206c3fb27SDimitry Andric /// \param s
472306c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
472406c3fb27SDimitry Andric ///    1, 2, 4, or 8.
472506c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Forwards a, m, i, mask and s, in that order, to the gatherq_q256 builtin.
 * a, i and mask are all reinterpreted as [4 x i64]; m is converted to a
 * pointer to const long long. */
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), (const long long *)(m), (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), (s)))
47310b57cec5SDimitry Andric 
473206c3fb27SDimitry Andric /// Gathers two 64-bit floating-point values from memory \a m using scaled
473306c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i.
473406c3fb27SDimitry Andric ///
473506c3fb27SDimitry Andric /// \code{.operation}
473606c3fb27SDimitry Andric /// FOR element := 0 to 1
473706c3fb27SDimitry Andric ///   j := element*64
473806c3fb27SDimitry Andric ///   k := element*32
473906c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
474006c3fb27SDimitry Andric /// ENDFOR
474106c3fb27SDimitry Andric /// \endcode
474206c3fb27SDimitry Andric ///
474306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
474406c3fb27SDimitry Andric ///
474506c3fb27SDimitry Andric /// \code
474606c3fb27SDimitry Andric /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
474706c3fb27SDimitry Andric /// \endcode
474806c3fb27SDimitry Andric ///
474906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPD instruction.
475006c3fb27SDimitry Andric ///
475106c3fb27SDimitry Andric /// \param m
475206c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
475306c3fb27SDimitry Andric /// \param i
475406c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
475506c3fb27SDimitry Andric ///    the first two elements are used.
475606c3fb27SDimitry Andric /// \param s
475706c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
475806c3fb27SDimitry Andric ///    1, 2, 4, or 8.
475906c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Unmasked form of the gatherd_pd builtin. The mask operand is built by
 * comparing a zero vector with itself for equality, which sets every mask
 * bit, so the pass-through operand can be an undefined vector. */
#define _mm_i32gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherd_pd( \
      (__v2df)_mm_undefined_pd(), (const double *)(m), (__v4si)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
47670b57cec5SDimitry Andric 
476806c3fb27SDimitry Andric /// Gathers four 64-bit floating-point values from memory \a m using scaled
476906c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i.
477006c3fb27SDimitry Andric ///
477106c3fb27SDimitry Andric /// \code{.operation}
477206c3fb27SDimitry Andric /// FOR element := 0 to 3
477306c3fb27SDimitry Andric ///   j := element*64
477406c3fb27SDimitry Andric ///   k := element*32
477506c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
477606c3fb27SDimitry Andric /// ENDFOR
477706c3fb27SDimitry Andric /// \endcode
477806c3fb27SDimitry Andric ///
477906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
478006c3fb27SDimitry Andric ///
478106c3fb27SDimitry Andric /// \code
478206c3fb27SDimitry Andric /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
478306c3fb27SDimitry Andric /// \endcode
478406c3fb27SDimitry Andric ///
478506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPD instruction.
478606c3fb27SDimitry Andric ///
478706c3fb27SDimitry Andric /// \param m
478806c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
478906c3fb27SDimitry Andric /// \param i
479006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
479106c3fb27SDimitry Andric /// \param s
479206c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
479306c3fb27SDimitry Andric ///    1, 2, 4, or 8.
479406c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Unmasked form of the gatherd_pd256 builtin. The mask operand is built by
 * comparing a zero vector with itself using _CMP_EQ_OQ, which sets every
 * mask bit, so the pass-through operand can be an undefined vector. */
#define _mm256_i32gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)_mm256_undefined_pd(), (const double *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
48030b57cec5SDimitry Andric 
480406c3fb27SDimitry Andric /// Gathers two 64-bit floating-point values from memory \a m using scaled
480506c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [2 x i64] in \a i.
480606c3fb27SDimitry Andric ///
480706c3fb27SDimitry Andric /// \code{.operation}
480806c3fb27SDimitry Andric /// FOR element := 0 to 1
480906c3fb27SDimitry Andric ///   j := element*64
481006c3fb27SDimitry Andric ///   k := element*64
481106c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
481206c3fb27SDimitry Andric /// ENDFOR
481306c3fb27SDimitry Andric /// \endcode
481406c3fb27SDimitry Andric ///
481506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
481606c3fb27SDimitry Andric ///
481706c3fb27SDimitry Andric /// \code
481806c3fb27SDimitry Andric /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
481906c3fb27SDimitry Andric /// \endcode
482006c3fb27SDimitry Andric ///
482106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPD instruction.
482206c3fb27SDimitry Andric ///
482306c3fb27SDimitry Andric /// \param m
482406c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
482506c3fb27SDimitry Andric /// \param i
482606c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
482706c3fb27SDimitry Andric /// \param s
482806c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
482906c3fb27SDimitry Andric ///    1, 2, 4, or 8.
483006c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Unmasked form of the gatherq_pd builtin. The mask operand is built by
 * comparing a zero vector with itself for equality, which sets every mask
 * bit, so the pass-through operand can be an undefined vector. */
#define _mm_i64gather_pd(m, i, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)_mm_undefined_pd(), (const double *)(m), (__v2di)(__m128i)(i), \
      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
48380b57cec5SDimitry Andric 
483906c3fb27SDimitry Andric /// Gathers four 64-bit floating-point values from memory \a m using scaled
484006c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [4 x i64] in \a i.
484106c3fb27SDimitry Andric ///
484206c3fb27SDimitry Andric /// \code{.operation}
484306c3fb27SDimitry Andric /// FOR element := 0 to 3
484406c3fb27SDimitry Andric ///   j := element*64
484506c3fb27SDimitry Andric ///   k := element*64
484606c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
484706c3fb27SDimitry Andric /// ENDFOR
484806c3fb27SDimitry Andric /// \endcode
484906c3fb27SDimitry Andric ///
485006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
485106c3fb27SDimitry Andric ///
485206c3fb27SDimitry Andric /// \code
485306c3fb27SDimitry Andric /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
485406c3fb27SDimitry Andric /// \endcode
485506c3fb27SDimitry Andric ///
485606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPD instruction.
485706c3fb27SDimitry Andric ///
485806c3fb27SDimitry Andric /// \param m
485906c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
486006c3fb27SDimitry Andric /// \param i
486106c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
486206c3fb27SDimitry Andric /// \param s
486306c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
486406c3fb27SDimitry Andric ///    1, 2, 4, or 8.
486506c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Unmasked form of the gatherq_pd256 builtin. The mask operand is built by
 * comparing a zero vector with itself using _CMP_EQ_OQ, which sets every
 * mask bit, so the pass-through operand can be an undefined vector. */
#define _mm256_i64gather_pd(m, i, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)_mm256_undefined_pd(), (const double *)(m), \
      (__v4di)(__m256i)(i), \
      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), \
                            _CMP_EQ_OQ), \
      (s)))
48740b57cec5SDimitry Andric 
487506c3fb27SDimitry Andric /// Gathers four 32-bit floating-point values from memory \a m using scaled
487606c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i.
487706c3fb27SDimitry Andric ///
487806c3fb27SDimitry Andric /// \code{.operation}
487906c3fb27SDimitry Andric /// FOR element := 0 to 3
488006c3fb27SDimitry Andric ///   j := element*32
488106c3fb27SDimitry Andric ///   k := element*32
488206c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
488306c3fb27SDimitry Andric /// ENDFOR
488406c3fb27SDimitry Andric /// \endcode
488506c3fb27SDimitry Andric ///
488606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
488706c3fb27SDimitry Andric ///
488806c3fb27SDimitry Andric /// \code
488906c3fb27SDimitry Andric /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
489006c3fb27SDimitry Andric /// \endcode
489106c3fb27SDimitry Andric ///
489206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPS instruction.
489306c3fb27SDimitry Andric ///
489406c3fb27SDimitry Andric /// \param m
489506c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
489606c3fb27SDimitry Andric /// \param i
489706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
489806c3fb27SDimitry Andric /// \param s
489906c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
490006c3fb27SDimitry Andric ///    1, 2, 4, or 8.
490106c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form of the gatherd_ps builtin. The mask operand is built by
 * comparing a zero vector with itself for equality, which sets every mask
 * bit, so the pass-through operand can be an undefined vector. */
#define _mm_i32gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)_mm_undefined_ps(), (const float *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
49090b57cec5SDimitry Andric 
491006c3fb27SDimitry Andric /// Gathers eight 32-bit floating-point values from memory \a m using scaled
491106c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [8 x i32] in \a i.
491206c3fb27SDimitry Andric ///
491306c3fb27SDimitry Andric /// \code{.operation}
491406c3fb27SDimitry Andric /// FOR element := 0 to 7
491506c3fb27SDimitry Andric ///   j := element*32
491606c3fb27SDimitry Andric ///   k := element*32
491706c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
491806c3fb27SDimitry Andric /// ENDFOR
491906c3fb27SDimitry Andric /// \endcode
492006c3fb27SDimitry Andric ///
492106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
492206c3fb27SDimitry Andric ///
492306c3fb27SDimitry Andric /// \code
492406c3fb27SDimitry Andric /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
492506c3fb27SDimitry Andric /// \endcode
492606c3fb27SDimitry Andric ///
492706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPS instruction.
492806c3fb27SDimitry Andric ///
492906c3fb27SDimitry Andric /// \param m
493006c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
493106c3fb27SDimitry Andric /// \param i
493206c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
493306c3fb27SDimitry Andric /// \param s
493406c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
493506c3fb27SDimitry Andric ///    1, 2, 4, or 8.
493606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Unmasked form of the gatherd_ps256 builtin. The mask operand is built by
 * comparing a zero vector with itself using _CMP_EQ_OQ, which sets every
 * mask bit, so the pass-through operand can be an undefined vector. */
#define _mm256_i32gather_ps(m, i, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)_mm256_undefined_ps(), (const float *)(m), \
      (__v8si)(__m256i)(i), \
      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), \
                            _CMP_EQ_OQ), \
      (s)))
49450b57cec5SDimitry Andric 
494606c3fb27SDimitry Andric /// Gathers two 32-bit floating-point values from memory \a m using scaled
494706c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
494806c3fb27SDimitry Andric ///    elements of the result are zeroed.
494906c3fb27SDimitry Andric ///
495006c3fb27SDimitry Andric /// \code{.operation}
495106c3fb27SDimitry Andric /// FOR element := 0 to 1
495206c3fb27SDimitry Andric ///   j := element*32
495306c3fb27SDimitry Andric ///   k := element*64
495406c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
495506c3fb27SDimitry Andric /// ENDFOR
495606c3fb27SDimitry Andric /// result[127:64] := 0
495706c3fb27SDimitry Andric /// \endcode
495806c3fb27SDimitry Andric ///
495906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
496006c3fb27SDimitry Andric ///
496106c3fb27SDimitry Andric /// \code
496206c3fb27SDimitry Andric /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
496306c3fb27SDimitry Andric /// \endcode
496406c3fb27SDimitry Andric ///
496506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPS instruction.
496606c3fb27SDimitry Andric ///
496706c3fb27SDimitry Andric /// \param m
496806c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
496906c3fb27SDimitry Andric /// \param i
497006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
497106c3fb27SDimitry Andric /// \param s
497206c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
497306c3fb27SDimitry Andric ///    1, 2, 4, or 8.
497406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Unmasked form of the gatherq_ps builtin. The index vector is [2 x i64]
 * while the pass-through, mask and result are [4 x float]. The mask operand
 * is built by comparing a zero vector with itself for equality, and the
 * pass-through operand is an undefined vector. */
#define _mm_i64gather_ps(m, i, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)_mm_undefined_ps(), (const float *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
49820b57cec5SDimitry Andric 
498306c3fb27SDimitry Andric /// Gathers four 32-bit floating-point values from memory \a m using scaled
498406c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [4 x i64] in \a i.
498506c3fb27SDimitry Andric ///
498606c3fb27SDimitry Andric /// \code{.operation}
498706c3fb27SDimitry Andric /// FOR element := 0 to 3
498806c3fb27SDimitry Andric ///   j := element*32
498906c3fb27SDimitry Andric ///   k := element*64
499006c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
499106c3fb27SDimitry Andric /// ENDFOR
499206c3fb27SDimitry Andric /// \endcode
499306c3fb27SDimitry Andric ///
499406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
499506c3fb27SDimitry Andric ///
499606c3fb27SDimitry Andric /// \code
499706c3fb27SDimitry Andric /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
499806c3fb27SDimitry Andric /// \endcode
499906c3fb27SDimitry Andric ///
500006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPS instruction.
500106c3fb27SDimitry Andric ///
500206c3fb27SDimitry Andric /// \param m
500306c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
500406c3fb27SDimitry Andric /// \param i
500506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
500606c3fb27SDimitry Andric /// \param s
500706c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
500806c3fb27SDimitry Andric ///    1, 2, 4, or 8.
500906c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values.
50100b57cec5SDimitry Andric #define _mm256_i64gather_ps(m, i, s) \
5011349cc55cSDimitry Andric   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
50120b57cec5SDimitry Andric                                         (float const *)(m), \
50130b57cec5SDimitry Andric                                         (__v4di)(__m256i)(i), \
50140b57cec5SDimitry Andric                                         (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
50150b57cec5SDimitry Andric                                                              _mm_setzero_ps()), /* 0.0 == 0.0 in every lane: all-ones mask */ \
5016349cc55cSDimitry Andric                                         (s)))
50170b57cec5SDimitry Andric 
501806c3fb27SDimitry Andric /// Gathers four 32-bit integer values from memory \a m using scaled
501906c3fb27SDimitry Andric ///    indexes from the 128-bit vector of [4 x i32] in \a i.
502006c3fb27SDimitry Andric ///
502106c3fb27SDimitry Andric /// \code{.operation}
502206c3fb27SDimitry Andric /// FOR element := 0 to 3
502306c3fb27SDimitry Andric ///   j := element*32
502406c3fb27SDimitry Andric ///   k := element*32
502506c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
502606c3fb27SDimitry Andric /// ENDFOR
502706c3fb27SDimitry Andric /// \endcode
502806c3fb27SDimitry Andric ///
502906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
503006c3fb27SDimitry Andric ///
503106c3fb27SDimitry Andric /// \code
503206c3fb27SDimitry Andric /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
503306c3fb27SDimitry Andric /// \endcode
503406c3fb27SDimitry Andric ///
503506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDD instruction.
503606c3fb27SDimitry Andric ///
503706c3fb27SDimitry Andric /// \param m
503806c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
503906c3fb27SDimitry Andric /// \param i
504006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
504106c3fb27SDimitry Andric /// \param s
504206c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
504306c3fb27SDimitry Andric ///    1, 2, 4, or 8.
504406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
50450b57cec5SDimitry Andric #define _mm_i32gather_epi32(m, i, s) \
5046349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
50470b57cec5SDimitry Andric                                      (int const *)(m), (__v4si)(__m128i)(i), \
5048349cc55cSDimitry Andric                                      (__v4si)_mm_set1_epi32(-1), (s))) /* -1 in every element: all-ones mask */
50490b57cec5SDimitry Andric 
505006c3fb27SDimitry Andric /// Gathers eight 32-bit integer values from memory \a m using scaled
505106c3fb27SDimitry Andric ///    indexes from the 256-bit vector of [8 x i32] in \a i.
505206c3fb27SDimitry Andric ///
505306c3fb27SDimitry Andric /// \code{.operation}
505406c3fb27SDimitry Andric /// FOR element := 0 to 7
505506c3fb27SDimitry Andric ///   j := element*32
505606c3fb27SDimitry Andric ///   k := element*32
505706c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
505806c3fb27SDimitry Andric /// ENDFOR
505906c3fb27SDimitry Andric /// \endcode
506006c3fb27SDimitry Andric ///
506106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
506206c3fb27SDimitry Andric ///
506306c3fb27SDimitry Andric /// \code
506406c3fb27SDimitry Andric /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
506506c3fb27SDimitry Andric /// \endcode
506606c3fb27SDimitry Andric ///
506706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDD instruction.
506806c3fb27SDimitry Andric ///
506906c3fb27SDimitry Andric /// \param m
507006c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
507106c3fb27SDimitry Andric /// \param i
507206c3fb27SDimitry Andric ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
507306c3fb27SDimitry Andric /// \param s
507406c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
507506c3fb27SDimitry Andric ///    1, 2, 4, or 8.
507606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
50770b57cec5SDimitry Andric #define _mm256_i32gather_epi32(m, i, s) \
5078349cc55cSDimitry Andric   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
50790b57cec5SDimitry Andric                                         (int const *)(m), (__v8si)(__m256i)(i), \
5080349cc55cSDimitry Andric                                         (__v8si)_mm256_set1_epi32(-1), (s))) /* -1 in every element: all-ones mask */
50810b57cec5SDimitry Andric 
508206c3fb27SDimitry Andric /// Gathers two 32-bit integer values from memory \a m using scaled indexes
508306c3fb27SDimitry Andric ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
508406c3fb27SDimitry Andric ///    of the result are zeroed.
508506c3fb27SDimitry Andric ///
508606c3fb27SDimitry Andric /// \code{.operation}
508706c3fb27SDimitry Andric /// FOR element := 0 to 1
508806c3fb27SDimitry Andric ///   j := element*32
508906c3fb27SDimitry Andric ///   k := element*64
509006c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
509106c3fb27SDimitry Andric /// ENDFOR
509206c3fb27SDimitry Andric /// result[127:64] := 0
509306c3fb27SDimitry Andric /// \endcode
509406c3fb27SDimitry Andric ///
509506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
509606c3fb27SDimitry Andric ///
509706c3fb27SDimitry Andric /// \code
509806c3fb27SDimitry Andric /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
509906c3fb27SDimitry Andric /// \endcode
510006c3fb27SDimitry Andric ///
510106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQD instruction.
510206c3fb27SDimitry Andric ///
510306c3fb27SDimitry Andric /// \param m
510406c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
510506c3fb27SDimitry Andric /// \param i
510606c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
510706c3fb27SDimitry Andric /// \param s
510806c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
510906c3fb27SDimitry Andric ///    1, 2, 4, or 8.
511006c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
51110b57cec5SDimitry Andric #define _mm_i64gather_epi32(m, i, s) \
5112349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
51130b57cec5SDimitry Andric                                      (int const *)(m), (__v2di)(__m128i)(i), \
5114349cc55cSDimitry Andric                                      (__v4si)_mm_set1_epi32(-1), (s))) /* -1 in every element: all-ones mask */
51150b57cec5SDimitry Andric 
511606c3fb27SDimitry Andric /// Gathers four 32-bit integer values from memory \a m using scaled indexes
511706c3fb27SDimitry Andric ///    from the 256-bit vector of [4 x i64] in \a i.
511806c3fb27SDimitry Andric ///
511906c3fb27SDimitry Andric /// \code{.operation}
512006c3fb27SDimitry Andric /// FOR element := 0 to 3
512106c3fb27SDimitry Andric ///   j := element*32
512206c3fb27SDimitry Andric ///   k := element*64
512306c3fb27SDimitry Andric ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
512406c3fb27SDimitry Andric /// ENDFOR
512506c3fb27SDimitry Andric /// \endcode
512606c3fb27SDimitry Andric ///
512706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
512806c3fb27SDimitry Andric ///
512906c3fb27SDimitry Andric /// \code
513006c3fb27SDimitry Andric /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
513106c3fb27SDimitry Andric /// \endcode
513206c3fb27SDimitry Andric ///
513306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQD instruction.
513406c3fb27SDimitry Andric ///
513506c3fb27SDimitry Andric /// \param m
513606c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
513706c3fb27SDimitry Andric /// \param i
513806c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
513906c3fb27SDimitry Andric /// \param s
514006c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
514106c3fb27SDimitry Andric ///    1, 2, 4, or 8.
514206c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
51430b57cec5SDimitry Andric #define _mm256_i64gather_epi32(m, i, s) \
5144349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
51450b57cec5SDimitry Andric                                         (int const *)(m), (__v4di)(__m256i)(i), \
5146349cc55cSDimitry Andric                                         (__v4si)_mm_set1_epi32(-1), (s))) /* -1 in every element: all-ones mask */
51470b57cec5SDimitry Andric 
514806c3fb27SDimitry Andric /// Gathers two 64-bit integer values from memory \a m using scaled indexes
514906c3fb27SDimitry Andric ///    from the first two elements of the 128-bit vector of [4 x i32] in \a i.
515006c3fb27SDimitry Andric ///
515106c3fb27SDimitry Andric /// \code{.operation}
515206c3fb27SDimitry Andric /// FOR element := 0 to 1
515306c3fb27SDimitry Andric ///   j := element*64
515406c3fb27SDimitry Andric ///   k := element*32
515506c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
515606c3fb27SDimitry Andric /// ENDFOR
515706c3fb27SDimitry Andric /// \endcode
515806c3fb27SDimitry Andric ///
515906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
516006c3fb27SDimitry Andric ///
516106c3fb27SDimitry Andric /// \code
516206c3fb27SDimitry Andric /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
516306c3fb27SDimitry Andric /// \endcode
516406c3fb27SDimitry Andric ///
516506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
516606c3fb27SDimitry Andric ///
516706c3fb27SDimitry Andric /// \param m
516806c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
516906c3fb27SDimitry Andric /// \param i
517006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
517106c3fb27SDimitry Andric ///    the first two elements are used.
517206c3fb27SDimitry Andric /// \param s
517306c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
517406c3fb27SDimitry Andric ///    1, 2, 4, or 8.
517506c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
51760b57cec5SDimitry Andric #define _mm_i32gather_epi64(m, i, s) \
5177349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
51780b57cec5SDimitry Andric                                      (long long const *)(m), \
51790b57cec5SDimitry Andric                                      (__v4si)(__m128i)(i), \
5180349cc55cSDimitry Andric                                      (__v2di)_mm_set1_epi64x(-1), (s))) /* -1 in every element: all-ones mask */
51810b57cec5SDimitry Andric 
518206c3fb27SDimitry Andric /// Gathers four 64-bit integer values from memory \a m using scaled indexes
518306c3fb27SDimitry Andric ///    from the 128-bit vector of [4 x i32] in \a i.
518406c3fb27SDimitry Andric ///
518506c3fb27SDimitry Andric /// \code{.operation}
518606c3fb27SDimitry Andric /// FOR element := 0 to 3
518706c3fb27SDimitry Andric ///   j := element*64
518806c3fb27SDimitry Andric ///   k := element*32
518906c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
519006c3fb27SDimitry Andric /// ENDFOR
519106c3fb27SDimitry Andric /// \endcode
519206c3fb27SDimitry Andric ///
519306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
519406c3fb27SDimitry Andric ///
519506c3fb27SDimitry Andric /// \code
519606c3fb27SDimitry Andric /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
519706c3fb27SDimitry Andric /// \endcode
519806c3fb27SDimitry Andric ///
519906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
520006c3fb27SDimitry Andric ///
520106c3fb27SDimitry Andric /// \param m
520206c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
520306c3fb27SDimitry Andric /// \param i
520406c3fb27SDimitry Andric ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
520506c3fb27SDimitry Andric /// \param s
520606c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
520706c3fb27SDimitry Andric ///    1, 2, 4, or 8.
520806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
52090b57cec5SDimitry Andric #define _mm256_i32gather_epi64(m, i, s) \
5210349cc55cSDimitry Andric   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
52110b57cec5SDimitry Andric                                         (long long const *)(m), \
52120b57cec5SDimitry Andric                                         (__v4si)(__m128i)(i), \
5213349cc55cSDimitry Andric                                         (__v4di)_mm256_set1_epi64x(-1), (s))) /* -1 in every element: all-ones mask */
521506c3fb27SDimitry Andric /// Gathers two 64-bit integer values from memory \a m using scaled indexes
521506c3fb27SDimitry Andric /// Gathers two 64-bit integer values from memory \a m using scaled indexes
521606c3fb27SDimitry Andric ///    from the 128-bit vector of [2 x i64] in \a i.
521706c3fb27SDimitry Andric ///
521806c3fb27SDimitry Andric /// \code{.operation}
521906c3fb27SDimitry Andric /// FOR element := 0 to 1
522006c3fb27SDimitry Andric ///   j := element*64
522106c3fb27SDimitry Andric ///   k := element*64
522206c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
522306c3fb27SDimitry Andric /// ENDFOR
522406c3fb27SDimitry Andric /// \endcode
522506c3fb27SDimitry Andric ///
522606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
522706c3fb27SDimitry Andric ///
522806c3fb27SDimitry Andric /// \code
522906c3fb27SDimitry Andric /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
523006c3fb27SDimitry Andric /// \endcode
523106c3fb27SDimitry Andric ///
523206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
523306c3fb27SDimitry Andric ///
523406c3fb27SDimitry Andric /// \param m
523506c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
523606c3fb27SDimitry Andric /// \param i
523706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
523806c3fb27SDimitry Andric /// \param s
523906c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
524006c3fb27SDimitry Andric ///    1, 2, 4, or 8.
524106c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
52420b57cec5SDimitry Andric #define _mm_i64gather_epi64(m, i, s) \
5243349cc55cSDimitry Andric   ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
52440b57cec5SDimitry Andric                                      (long long const *)(m), \
52450b57cec5SDimitry Andric                                      (__v2di)(__m128i)(i), \
5246349cc55cSDimitry Andric                                      (__v2di)_mm_set1_epi64x(-1), (s))) /* -1 in every element: all-ones mask */
524806c3fb27SDimitry Andric /// Gathers four 64-bit integer values from memory \a m using scaled indexes
524806c3fb27SDimitry Andric /// Gathers four 64-bit integer values from memory \a m using scaled indexes
524906c3fb27SDimitry Andric ///    from the 256-bit vector of [4 x i64] in \a i.
525006c3fb27SDimitry Andric ///
525106c3fb27SDimitry Andric /// \code{.operation}
525206c3fb27SDimitry Andric /// FOR element := 0 to 3
525306c3fb27SDimitry Andric ///   j := element*64
525406c3fb27SDimitry Andric ///   k := element*64
525506c3fb27SDimitry Andric ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
525606c3fb27SDimitry Andric /// ENDFOR
525706c3fb27SDimitry Andric /// \endcode
525806c3fb27SDimitry Andric ///
525906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
526006c3fb27SDimitry Andric ///
526106c3fb27SDimitry Andric /// \code
526206c3fb27SDimitry Andric /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
526306c3fb27SDimitry Andric /// \endcode
526406c3fb27SDimitry Andric ///
526506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
526606c3fb27SDimitry Andric ///
526706c3fb27SDimitry Andric /// \param m
526806c3fb27SDimitry Andric ///    A pointer to the memory used for loading values.
526906c3fb27SDimitry Andric /// \param i
527006c3fb27SDimitry Andric ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
527106c3fb27SDimitry Andric /// \param s
527206c3fb27SDimitry Andric ///    A literal constant scale factor for the indexes in \a i. Must be
527306c3fb27SDimitry Andric ///    1, 2, 4, or 8.
527406c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
52750b57cec5SDimitry Andric #define _mm256_i64gather_epi64(m, i, s) \
5276349cc55cSDimitry Andric   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
52770b57cec5SDimitry Andric                                         (long long const *)(m), \
52780b57cec5SDimitry Andric                                         (__v4di)(__m256i)(i), \
5279349cc55cSDimitry Andric                                         (__v4di)_mm256_set1_epi64x(-1), (s))) /* -1 in every element: all-ones mask */
52810b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS256
52820b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS128
52830b57cec5SDimitry Andric 
52840b57cec5SDimitry Andric #endif /* __AVX2INTRIN_H */
5285