10b57cec5SDimitry Andric /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== 20b57cec5SDimitry Andric * 30b57cec5SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric * 70b57cec5SDimitry Andric *===-----------------------------------------------------------------------=== 80b57cec5SDimitry Andric */ 90b57cec5SDimitry Andric 100b57cec5SDimitry Andric #ifndef __IMMINTRIN_H 110b57cec5SDimitry Andric #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 120b57cec5SDimitry Andric #endif 130b57cec5SDimitry Andric 140b57cec5SDimitry Andric #ifndef __AVX2INTRIN_H 150b57cec5SDimitry Andric #define __AVX2INTRIN_H 160b57cec5SDimitry Andric 170b57cec5SDimitry Andric /* Define the default attributes for the functions in this file. */ 18*5f757f3fSDimitry Andric #define __DEFAULT_FN_ATTRS256 \ 19*5f757f3fSDimitry Andric __attribute__((__always_inline__, __nodebug__, \ 20*5f757f3fSDimitry Andric __target__("avx2,no-evex512"), __min_vector_width__(256))) 21*5f757f3fSDimitry Andric #define __DEFAULT_FN_ATTRS128 \ 22*5f757f3fSDimitry Andric __attribute__((__always_inline__, __nodebug__, \ 23*5f757f3fSDimitry Andric __target__("avx2,no-evex512"), __min_vector_width__(128))) 240b57cec5SDimitry Andric 250b57cec5SDimitry Andric /* SSE4 Multiple Packed Sums of Absolute Difference. */ 2606c3fb27SDimitry Andric /// Computes sixteen sum of absolute difference (SAD) operations on sets of 2706c3fb27SDimitry Andric /// four unsigned 8-bit integers from the 256-bit integer vectors \a X and 2806c3fb27SDimitry Andric /// \a Y. 
///
///    Eight SAD results are computed using the lower half of the input
///    vectors, and another eight using the upper half. These 16-bit values
///    are returned in the lower and upper halves of the 256-bit result,
///    respectively.
///
///    A single SAD operation selects four bytes from \a X and four bytes from
///    \a Y as input. It computes the differences between each \a X byte and
///    the corresponding \a Y byte, takes the absolute value of each
///    difference, and sums these four values to form one 16-bit result. The
///    intrinsic computes 16 of these results with different sets of input
///    bytes.
///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by \a M[1:0] times 32. The eight operations use successive
///    sets of four bytes from \a X; the starting bit position for the first
///    set of four bytes is specified by \a M[2] times 32. These bit positions
///    are all relative to the 128-bit lane for each set of eight operations.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///    An unsigned immediate value specifying the starting positions of the
///    bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
830b57cec5SDimitry Andric #define _mm256_mpsadbw_epu8(X, Y, M) \ 84349cc55cSDimitry Andric ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ 85349cc55cSDimitry Andric (__v32qi)(__m256i)(Y), (int)(M))) 860b57cec5SDimitry Andric 8706c3fb27SDimitry Andric /// Computes the absolute value of each signed byte in the 256-bit integer 8806c3fb27SDimitry Andric /// vector \a __a and returns each value in the corresponding byte of 8906c3fb27SDimitry Andric /// the result. 9006c3fb27SDimitry Andric /// 9106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 9206c3fb27SDimitry Andric /// 9306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPABSB instruction. 9406c3fb27SDimitry Andric /// 9506c3fb27SDimitry Andric /// \param __a 9606c3fb27SDimitry Andric /// A 256-bit integer vector. 9706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 980b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 990b57cec5SDimitry Andric _mm256_abs_epi8(__m256i __a) 1000b57cec5SDimitry Andric { 10104eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_abs((__v32qs)__a); 1020b57cec5SDimitry Andric } 1030b57cec5SDimitry Andric 10406c3fb27SDimitry Andric /// Computes the absolute value of each signed 16-bit element in the 256-bit 10506c3fb27SDimitry Andric /// vector of [16 x i16] in \a __a and returns each value in the 10606c3fb27SDimitry Andric /// corresponding element of the result. 10706c3fb27SDimitry Andric /// 10806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 10906c3fb27SDimitry Andric /// 11006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPABSW instruction. 11106c3fb27SDimitry Andric /// 11206c3fb27SDimitry Andric /// \param __a 11306c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 11406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
1150b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 1160b57cec5SDimitry Andric _mm256_abs_epi16(__m256i __a) 1170b57cec5SDimitry Andric { 11804eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_abs((__v16hi)__a); 1190b57cec5SDimitry Andric } 1200b57cec5SDimitry Andric 12106c3fb27SDimitry Andric /// Computes the absolute value of each signed 32-bit element in the 256-bit 12206c3fb27SDimitry Andric /// vector of [8 x i32] in \a __a and returns each value in the 12306c3fb27SDimitry Andric /// corresponding element of the result. 12406c3fb27SDimitry Andric /// 12506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 12606c3fb27SDimitry Andric /// 12706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPABSD instruction. 12806c3fb27SDimitry Andric /// 12906c3fb27SDimitry Andric /// \param __a 13006c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 13106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 1320b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 1330b57cec5SDimitry Andric _mm256_abs_epi32(__m256i __a) 1340b57cec5SDimitry Andric { 13504eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_abs((__v8si)__a); 1360b57cec5SDimitry Andric } 1370b57cec5SDimitry Andric 13806c3fb27SDimitry Andric /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit 13906c3fb27SDimitry Andric /// integers using signed saturation, and returns the 256-bit result. 
14006c3fb27SDimitry Andric /// 14106c3fb27SDimitry Andric /// \code{.operation} 14206c3fb27SDimitry Andric /// FOR i := 0 TO 7 14306c3fb27SDimitry Andric /// j := i*16 14406c3fb27SDimitry Andric /// k := i*8 14506c3fb27SDimitry Andric /// result[7+k:k] := SATURATE8(__a[15+j:j]) 14606c3fb27SDimitry Andric /// result[71+k:64+k] := SATURATE8(__b[15+j:j]) 14706c3fb27SDimitry Andric /// result[135+k:128+k] := SATURATE8(__a[143+j:128+j]) 14806c3fb27SDimitry Andric /// result[199+k:192+k] := SATURATE8(__b[143+j:128+j]) 14906c3fb27SDimitry Andric /// ENDFOR 15006c3fb27SDimitry Andric /// \endcode 15106c3fb27SDimitry Andric /// 15206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 15306c3fb27SDimitry Andric /// 15406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKSSWB instruction. 15506c3fb27SDimitry Andric /// 15606c3fb27SDimitry Andric /// \param __a 15706c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used to generate result[63:0] and 15806c3fb27SDimitry Andric /// result[191:128]. 15906c3fb27SDimitry Andric /// \param __b 16006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used to generate result[127:64] and 16106c3fb27SDimitry Andric /// result[255:192]. 16206c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 1630b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 1640b57cec5SDimitry Andric _mm256_packs_epi16(__m256i __a, __m256i __b) 1650b57cec5SDimitry Andric { 1660b57cec5SDimitry Andric return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); 1670b57cec5SDimitry Andric } 1680b57cec5SDimitry Andric 16906c3fb27SDimitry Andric /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit 17006c3fb27SDimitry Andric /// integers using signed saturation, and returns the resulting 256-bit 17106c3fb27SDimitry Andric /// vector of [16 x i16]. 
17206c3fb27SDimitry Andric /// 17306c3fb27SDimitry Andric /// \code{.operation} 17406c3fb27SDimitry Andric /// FOR i := 0 TO 3 17506c3fb27SDimitry Andric /// j := i*32 17606c3fb27SDimitry Andric /// k := i*16 17706c3fb27SDimitry Andric /// result[15+k:k] := SATURATE16(__a[31+j:j]) 17806c3fb27SDimitry Andric /// result[79+k:64+k] := SATURATE16(__b[31+j:j]) 17906c3fb27SDimitry Andric /// result[143+k:128+k] := SATURATE16(__a[159+j:128+j]) 18006c3fb27SDimitry Andric /// result[207+k:192+k] := SATURATE16(__b[159+j:128+j]) 18106c3fb27SDimitry Andric /// ENDFOR 18206c3fb27SDimitry Andric /// \endcode 18306c3fb27SDimitry Andric /// 18406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 18506c3fb27SDimitry Andric /// 18606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKSSDW instruction. 18706c3fb27SDimitry Andric /// 18806c3fb27SDimitry Andric /// \param __a 18906c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used to generate result[63:0] and 19006c3fb27SDimitry Andric /// result[191:128]. 19106c3fb27SDimitry Andric /// \param __b 19206c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used to generate result[127:64] and 19306c3fb27SDimitry Andric /// result[255:192]. 19406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 1950b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 1960b57cec5SDimitry Andric _mm256_packs_epi32(__m256i __a, __m256i __b) 1970b57cec5SDimitry Andric { 1980b57cec5SDimitry Andric return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); 1990b57cec5SDimitry Andric } 2000b57cec5SDimitry Andric 20106c3fb27SDimitry Andric /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers 20206c3fb27SDimitry Andric /// using unsigned saturation, and returns the 256-bit result. 
20306c3fb27SDimitry Andric /// 20406c3fb27SDimitry Andric /// \code{.operation} 20506c3fb27SDimitry Andric /// FOR i := 0 TO 7 20606c3fb27SDimitry Andric /// j := i*16 20706c3fb27SDimitry Andric /// k := i*8 20806c3fb27SDimitry Andric /// result[7+k:k] := SATURATE8U(__a[15+j:j]) 20906c3fb27SDimitry Andric /// result[71+k:64+k] := SATURATE8U(__b[15+j:j]) 21006c3fb27SDimitry Andric /// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j]) 21106c3fb27SDimitry Andric /// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j]) 21206c3fb27SDimitry Andric /// ENDFOR 21306c3fb27SDimitry Andric /// \endcode 21406c3fb27SDimitry Andric /// 21506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 21606c3fb27SDimitry Andric /// 21706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKUSWB instruction. 21806c3fb27SDimitry Andric /// 21906c3fb27SDimitry Andric /// \param __a 22006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used to generate result[63:0] and 22106c3fb27SDimitry Andric /// result[191:128]. 22206c3fb27SDimitry Andric /// \param __b 22306c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used to generate result[127:64] and 22406c3fb27SDimitry Andric /// result[255:192]. 22506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 2260b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2270b57cec5SDimitry Andric _mm256_packus_epi16(__m256i __a, __m256i __b) 2280b57cec5SDimitry Andric { 2290b57cec5SDimitry Andric return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); 2300b57cec5SDimitry Andric } 2310b57cec5SDimitry Andric 23206c3fb27SDimitry Andric /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers 23306c3fb27SDimitry Andric /// using unsigned saturation, and returns the resulting 256-bit vector of 23406c3fb27SDimitry Andric /// [16 x i16]. 
23506c3fb27SDimitry Andric /// 23606c3fb27SDimitry Andric /// \code{.operation} 23706c3fb27SDimitry Andric /// FOR i := 0 TO 3 23806c3fb27SDimitry Andric /// j := i*32 23906c3fb27SDimitry Andric /// k := i*16 24006c3fb27SDimitry Andric /// result[15+k:k] := SATURATE16U(__V1[31+j:j]) 24106c3fb27SDimitry Andric /// result[79+k:64+k] := SATURATE16U(__V2[31+j:j]) 24206c3fb27SDimitry Andric /// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j]) 24306c3fb27SDimitry Andric /// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j]) 24406c3fb27SDimitry Andric /// ENDFOR 24506c3fb27SDimitry Andric /// \endcode 24606c3fb27SDimitry Andric /// 24706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 24806c3fb27SDimitry Andric /// 24906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPACKUSDW instruction. 25006c3fb27SDimitry Andric /// 25106c3fb27SDimitry Andric /// \param __V1 25206c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used to generate result[63:0] and 25306c3fb27SDimitry Andric /// result[191:128]. 25406c3fb27SDimitry Andric /// \param __V2 25506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used to generate result[127:64] and 25606c3fb27SDimitry Andric /// result[255:192]. 25706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 2580b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2590b57cec5SDimitry Andric _mm256_packus_epi32(__m256i __V1, __m256i __V2) 2600b57cec5SDimitry Andric { 2610b57cec5SDimitry Andric return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); 2620b57cec5SDimitry Andric } 2630b57cec5SDimitry Andric 26406c3fb27SDimitry Andric /// Adds 8-bit integers from corresponding bytes of two 256-bit integer 26506c3fb27SDimitry Andric /// vectors and returns the lower 8 bits of each sum in the corresponding 26606c3fb27SDimitry Andric /// byte of the 256-bit integer vector result (overflow is ignored). 
26706c3fb27SDimitry Andric /// 26806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 26906c3fb27SDimitry Andric /// 27006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDB instruction. 27106c3fb27SDimitry Andric /// 27206c3fb27SDimitry Andric /// \param __a 27306c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the source operands. 27406c3fb27SDimitry Andric /// \param __b 27506c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the source operands. 27606c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the sums. 2770b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2780b57cec5SDimitry Andric _mm256_add_epi8(__m256i __a, __m256i __b) 2790b57cec5SDimitry Andric { 2800b57cec5SDimitry Andric return (__m256i)((__v32qu)__a + (__v32qu)__b); 2810b57cec5SDimitry Andric } 2820b57cec5SDimitry Andric 28306c3fb27SDimitry Andric /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 28406c3fb27SDimitry Andric /// [16 x i16] and returns the lower 16 bits of each sum in the 28506c3fb27SDimitry Andric /// corresponding element of the [16 x i16] result (overflow is ignored). 28606c3fb27SDimitry Andric /// 28706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 28806c3fb27SDimitry Andric /// 28906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDW instruction. 29006c3fb27SDimitry Andric /// 29106c3fb27SDimitry Andric /// \param __a 29206c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 29306c3fb27SDimitry Andric /// \param __b 29406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 29506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums. 
2960b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2970b57cec5SDimitry Andric _mm256_add_epi16(__m256i __a, __m256i __b) 2980b57cec5SDimitry Andric { 2990b57cec5SDimitry Andric return (__m256i)((__v16hu)__a + (__v16hu)__b); 3000b57cec5SDimitry Andric } 3010b57cec5SDimitry Andric 30206c3fb27SDimitry Andric /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of 30306c3fb27SDimitry Andric /// [8 x i32] and returns the lower 32 bits of each sum in the corresponding 30406c3fb27SDimitry Andric /// element of the [8 x i32] result (overflow is ignored). 30506c3fb27SDimitry Andric /// 30606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 30706c3fb27SDimitry Andric /// 30806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDD instruction. 30906c3fb27SDimitry Andric /// 31006c3fb27SDimitry Andric /// \param __a 31106c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 31206c3fb27SDimitry Andric /// \param __b 31306c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 31406c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sums. 3150b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 3160b57cec5SDimitry Andric _mm256_add_epi32(__m256i __a, __m256i __b) 3170b57cec5SDimitry Andric { 3180b57cec5SDimitry Andric return (__m256i)((__v8su)__a + (__v8su)__b); 3190b57cec5SDimitry Andric } 3200b57cec5SDimitry Andric 32106c3fb27SDimitry Andric /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of 32206c3fb27SDimitry Andric /// [4 x i64] and returns the lower 64 bits of each sum in the corresponding 32306c3fb27SDimitry Andric /// element of the [4 x i64] result (overflow is ignored). 
32406c3fb27SDimitry Andric /// 32506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 32606c3fb27SDimitry Andric /// 32706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDQ instruction. 32806c3fb27SDimitry Andric /// 32906c3fb27SDimitry Andric /// \param __a 33006c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing one of the source operands. 33106c3fb27SDimitry Andric /// \param __b 33206c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing one of the source operands. 33306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sums. 3340b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 3350b57cec5SDimitry Andric _mm256_add_epi64(__m256i __a, __m256i __b) 3360b57cec5SDimitry Andric { 3370b57cec5SDimitry Andric return (__m256i)((__v4du)__a + (__v4du)__b); 3380b57cec5SDimitry Andric } 3390b57cec5SDimitry Andric 34006c3fb27SDimitry Andric /// Adds 8-bit integers from corresponding bytes of two 256-bit integer 34106c3fb27SDimitry Andric /// vectors using signed saturation, and returns each sum in the 34206c3fb27SDimitry Andric /// corresponding byte of the 256-bit integer vector result. 34306c3fb27SDimitry Andric /// 34406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 34506c3fb27SDimitry Andric /// 34606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDSB instruction. 34706c3fb27SDimitry Andric /// 34806c3fb27SDimitry Andric /// \param __a 34906c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the source operands. 35006c3fb27SDimitry Andric /// \param __b 35106c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the source operands. 35206c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the sums. 
3530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 3540b57cec5SDimitry Andric _mm256_adds_epi8(__m256i __a, __m256i __b) 3550b57cec5SDimitry Andric { 35681ad6265SDimitry Andric return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b); 3570b57cec5SDimitry Andric } 3580b57cec5SDimitry Andric 35906c3fb27SDimitry Andric /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 36006c3fb27SDimitry Andric /// [16 x i16] using signed saturation, and returns the [16 x i16] result. 36106c3fb27SDimitry Andric /// 36206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 36306c3fb27SDimitry Andric /// 36406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDSW instruction. 36506c3fb27SDimitry Andric /// 36606c3fb27SDimitry Andric /// \param __a 36706c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 36806c3fb27SDimitry Andric /// \param __b 36906c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 37006c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums. 3710b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 3720b57cec5SDimitry Andric _mm256_adds_epi16(__m256i __a, __m256i __b) 3730b57cec5SDimitry Andric { 37481ad6265SDimitry Andric return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b); 3750b57cec5SDimitry Andric } 3760b57cec5SDimitry Andric 37706c3fb27SDimitry Andric /// Adds 8-bit integers from corresponding bytes of two 256-bit integer 37806c3fb27SDimitry Andric /// vectors using unsigned saturation, and returns each sum in the 37906c3fb27SDimitry Andric /// corresponding byte of the 256-bit integer vector result. 38006c3fb27SDimitry Andric /// 38106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 38206c3fb27SDimitry Andric /// 38306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDUSB instruction. 
38406c3fb27SDimitry Andric /// 38506c3fb27SDimitry Andric /// \param __a 38606c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the source operands. 38706c3fb27SDimitry Andric /// \param __b 38806c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the source operands. 38906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the sums. 3900b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 3910b57cec5SDimitry Andric _mm256_adds_epu8(__m256i __a, __m256i __b) 3920b57cec5SDimitry Andric { 39381ad6265SDimitry Andric return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b); 3940b57cec5SDimitry Andric } 3950b57cec5SDimitry Andric 39606c3fb27SDimitry Andric /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of 39706c3fb27SDimitry Andric /// [16 x i16] using unsigned saturation, and returns the [16 x i16] result. 39806c3fb27SDimitry Andric /// 39906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 40006c3fb27SDimitry Andric /// 40106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPADDUSW instruction. 40206c3fb27SDimitry Andric /// 40306c3fb27SDimitry Andric /// \param __a 40406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 40506c3fb27SDimitry Andric /// \param __b 40606c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 40706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums. 
4080b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 4090b57cec5SDimitry Andric _mm256_adds_epu16(__m256i __a, __m256i __b) 4100b57cec5SDimitry Andric { 41181ad6265SDimitry Andric return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b); 4120b57cec5SDimitry Andric } 4130b57cec5SDimitry Andric 41406c3fb27SDimitry Andric /// Uses the lower half of the 256-bit vector \a a as the upper half of a 41506c3fb27SDimitry Andric /// temporary 256-bit value, and the lower half of the 256-bit vector \a b 41606c3fb27SDimitry Andric /// as the lower half of the temporary value. Right-shifts the temporary 41706c3fb27SDimitry Andric /// value by \a n bytes, and uses the lower 16 bytes of the shifted value 41806c3fb27SDimitry Andric /// as the lower 16 bytes of the result. Uses the upper halves of \a a and 41906c3fb27SDimitry Andric /// \a b to make another temporary value, right shifts by \a n, and uses 42006c3fb27SDimitry Andric /// the lower 16 bytes of the shifted value as the upper 16 bytes of the 42106c3fb27SDimitry Andric /// result. 42206c3fb27SDimitry Andric /// 42306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 42406c3fb27SDimitry Andric /// 42506c3fb27SDimitry Andric /// \code 42606c3fb27SDimitry Andric /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n); 42706c3fb27SDimitry Andric /// \endcode 42806c3fb27SDimitry Andric /// 42906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPALIGNR instruction. 43006c3fb27SDimitry Andric /// 43106c3fb27SDimitry Andric /// \param a 43206c3fb27SDimitry Andric /// A 256-bit integer vector containing source values. 43306c3fb27SDimitry Andric /// \param b 43406c3fb27SDimitry Andric /// A 256-bit integer vector containing source values. 43506c3fb27SDimitry Andric /// \param n 43606c3fb27SDimitry Andric /// An immediate value specifying the number of bytes to shift. 43706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
4380b57cec5SDimitry Andric #define _mm256_alignr_epi8(a, b, n) \ 439349cc55cSDimitry Andric ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ 440349cc55cSDimitry Andric (__v32qi)(__m256i)(b), (n))) 4410b57cec5SDimitry Andric 44206c3fb27SDimitry Andric /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and 44306c3fb27SDimitry Andric /// \a __b. 44406c3fb27SDimitry Andric /// 44506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 44606c3fb27SDimitry Andric /// 44706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPAND instruction. 44806c3fb27SDimitry Andric /// 44906c3fb27SDimitry Andric /// \param __a 45006c3fb27SDimitry Andric /// A 256-bit integer vector. 45106c3fb27SDimitry Andric /// \param __b 45206c3fb27SDimitry Andric /// A 256-bit integer vector. 45306c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 4540b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 4550b57cec5SDimitry Andric _mm256_and_si256(__m256i __a, __m256i __b) 4560b57cec5SDimitry Andric { 4570b57cec5SDimitry Andric return (__m256i)((__v4du)__a & (__v4du)__b); 4580b57cec5SDimitry Andric } 4590b57cec5SDimitry Andric 46006c3fb27SDimitry Andric /// Computes the bitwise AND of the 256-bit integer vector in \a __b with 46106c3fb27SDimitry Andric /// the bitwise NOT of the 256-bit integer vector in \a __a. 46206c3fb27SDimitry Andric /// 46306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 46406c3fb27SDimitry Andric /// 46506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPANDN instruction. 46606c3fb27SDimitry Andric /// 46706c3fb27SDimitry Andric /// \param __a 46806c3fb27SDimitry Andric /// A 256-bit integer vector. 46906c3fb27SDimitry Andric /// \param __b 47006c3fb27SDimitry Andric /// A 256-bit integer vector. 47106c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
4720b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 4730b57cec5SDimitry Andric _mm256_andnot_si256(__m256i __a, __m256i __b) 4740b57cec5SDimitry Andric { 4750b57cec5SDimitry Andric return (__m256i)(~(__v4du)__a & (__v4du)__b); 4760b57cec5SDimitry Andric } 4770b57cec5SDimitry Andric 47806c3fb27SDimitry Andric /// Computes the averages of the corresponding unsigned bytes in the two 47906c3fb27SDimitry Andric /// 256-bit integer vectors in \a __a and \a __b and returns each 48006c3fb27SDimitry Andric /// average in the corresponding byte of the 256-bit result. 48106c3fb27SDimitry Andric /// 48206c3fb27SDimitry Andric /// \code{.operation} 48306c3fb27SDimitry Andric /// FOR i := 0 TO 31 48406c3fb27SDimitry Andric /// j := i*8 48506c3fb27SDimitry Andric /// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1 48606c3fb27SDimitry Andric /// ENDFOR 48706c3fb27SDimitry Andric /// \endcode 48806c3fb27SDimitry Andric /// 48906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 49006c3fb27SDimitry Andric /// 49106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPAVGB instruction. 49206c3fb27SDimitry Andric /// 49306c3fb27SDimitry Andric /// \param __a 49406c3fb27SDimitry Andric /// A 256-bit integer vector. 49506c3fb27SDimitry Andric /// \param __b 49606c3fb27SDimitry Andric /// A 256-bit integer vector. 49706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
4980b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 4990b57cec5SDimitry Andric _mm256_avg_epu8(__m256i __a, __m256i __b) 5000b57cec5SDimitry Andric { 5010b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); 5020b57cec5SDimitry Andric } 5030b57cec5SDimitry Andric 50406c3fb27SDimitry Andric /// Computes the averages of the corresponding unsigned 16-bit integers in 50506c3fb27SDimitry Andric /// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns 50606c3fb27SDimitry Andric /// each average in the corresponding element of the 256-bit result. 50706c3fb27SDimitry Andric /// 50806c3fb27SDimitry Andric /// \code{.operation} 50906c3fb27SDimitry Andric /// FOR i := 0 TO 15 51006c3fb27SDimitry Andric /// j := i*16 51106c3fb27SDimitry Andric /// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1 51206c3fb27SDimitry Andric /// ENDFOR 51306c3fb27SDimitry Andric /// \endcode 51406c3fb27SDimitry Andric /// 51506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 51606c3fb27SDimitry Andric /// 51706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPAVGW instruction. 51806c3fb27SDimitry Andric /// 51906c3fb27SDimitry Andric /// \param __a 52006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 52106c3fb27SDimitry Andric /// \param __b 52206c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 52306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
5240b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 5250b57cec5SDimitry Andric _mm256_avg_epu16(__m256i __a, __m256i __b) 5260b57cec5SDimitry Andric { 5270b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); 5280b57cec5SDimitry Andric } 5290b57cec5SDimitry Andric 53006c3fb27SDimitry Andric /// Merges 8-bit integer values from either of the two 256-bit vectors 53106c3fb27SDimitry Andric /// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns 53206c3fb27SDimitry Andric /// the resulting 256-bit integer vector. 53306c3fb27SDimitry Andric /// 53406c3fb27SDimitry Andric /// \code{.operation} 53506c3fb27SDimitry Andric /// FOR i := 0 TO 31 53606c3fb27SDimitry Andric /// j := i*8 53706c3fb27SDimitry Andric /// IF __M[7+i] == 0 53806c3fb27SDimitry Andric /// result[7+j:j] := __V1[7+j:j] 53906c3fb27SDimitry Andric /// ELSE 54006c3fb27SDimitry Andric /// result[7+j:j] := __V2[7+j:j] 54106c3fb27SDimitry Andric /// FI 54206c3fb27SDimitry Andric /// ENDFOR 54306c3fb27SDimitry Andric /// \endcode 54406c3fb27SDimitry Andric /// 54506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 54606c3fb27SDimitry Andric /// 54706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBLENDVB instruction. 54806c3fb27SDimitry Andric /// 54906c3fb27SDimitry Andric /// \param __V1 55006c3fb27SDimitry Andric /// A 256-bit integer vector containing source values. 55106c3fb27SDimitry Andric /// \param __V2 55206c3fb27SDimitry Andric /// A 256-bit integer vector containing source values. 55306c3fb27SDimitry Andric /// \param __M 55406c3fb27SDimitry Andric /// A 256-bit integer vector, with bit [7] of each byte specifying the 55506c3fb27SDimitry Andric /// source for each corresponding byte of the result. When the mask bit 55606c3fb27SDimitry Andric /// is 0, the byte is copied from \a __V1; otherwise, it is copied from 55706c3fb27SDimitry Andric /// \a __V2. 
55806c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 5590b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 5600b57cec5SDimitry Andric _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) 5610b57cec5SDimitry Andric { 5620b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, 5630b57cec5SDimitry Andric (__v32qi)__M); 5640b57cec5SDimitry Andric } 5650b57cec5SDimitry Andric 56606c3fb27SDimitry Andric /// Merges 16-bit integer values from either of the two 256-bit vectors 56706c3fb27SDimitry Andric /// \a V1 or \a V2, as specified by the immediate integer operand \a M, 56806c3fb27SDimitry Andric /// and returns the resulting 256-bit vector of [16 x i16]. 56906c3fb27SDimitry Andric /// 57006c3fb27SDimitry Andric /// \code{.operation} 57106c3fb27SDimitry Andric /// FOR i := 0 TO 7 57206c3fb27SDimitry Andric /// j := i*16 57306c3fb27SDimitry Andric /// IF M[i] == 0 57406c3fb27SDimitry Andric /// result[7+j:j] := V1[7+j:j] 57506c3fb27SDimitry Andric /// result[135+j:128+j] := V1[135+j:128+j] 57606c3fb27SDimitry Andric /// ELSE 57706c3fb27SDimitry Andric /// result[7+j:j] := V2[7+j:j] 57806c3fb27SDimitry Andric /// result[135+j:128+j] := V2[135+j:128+j] 57906c3fb27SDimitry Andric /// FI 58006c3fb27SDimitry Andric /// ENDFOR 58106c3fb27SDimitry Andric /// \endcode 58206c3fb27SDimitry Andric /// 58306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 58406c3fb27SDimitry Andric /// 58506c3fb27SDimitry Andric /// \code 58606c3fb27SDimitry Andric /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M); 58706c3fb27SDimitry Andric /// \endcode 58806c3fb27SDimitry Andric /// 58906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBLENDW instruction. 59006c3fb27SDimitry Andric /// 59106c3fb27SDimitry Andric /// \param V1 59206c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing source values. 
59306c3fb27SDimitry Andric /// \param V2 59406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing source values. 59506c3fb27SDimitry Andric /// \param M 59606c3fb27SDimitry Andric /// An immediate 8-bit integer operand, with bits [7:0] specifying the 59706c3fb27SDimitry Andric /// source for each element of the result. The position of the mask bit 59806c3fb27SDimitry Andric /// corresponds to the index of a copied value. When a mask bit is 0, the 59906c3fb27SDimitry Andric /// element is copied from \a V1; otherwise, it is copied from \a V2. 60006c3fb27SDimitry Andric /// \a M[0] determines the source for elements 0 and 8, \a M[1] for 60106c3fb27SDimitry Andric /// elements 1 and 9, and so forth. 60206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 6030b57cec5SDimitry Andric #define _mm256_blend_epi16(V1, V2, M) \ 604349cc55cSDimitry Andric ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ 605349cc55cSDimitry Andric (__v16hi)(__m256i)(V2), (int)(M))) 6060b57cec5SDimitry Andric 60706c3fb27SDimitry Andric /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and 60806c3fb27SDimitry Andric /// \a __b for equality and returns the outcomes in the corresponding 60906c3fb27SDimitry Andric /// bytes of the 256-bit result. 61006c3fb27SDimitry Andric /// 61106c3fb27SDimitry Andric /// \code{.operation} 61206c3fb27SDimitry Andric /// FOR i := 0 TO 31 61306c3fb27SDimitry Andric /// j := i*8 61406c3fb27SDimitry Andric /// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0 61506c3fb27SDimitry Andric /// ENDFOR 61606c3fb27SDimitry Andric /// \endcode 61706c3fb27SDimitry Andric /// 61806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 61906c3fb27SDimitry Andric /// 62006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQB instruction. 
62106c3fb27SDimitry Andric /// 62206c3fb27SDimitry Andric /// \param __a 62306c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the inputs. 62406c3fb27SDimitry Andric /// \param __b 62506c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the inputs. 62606c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 6270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 6280b57cec5SDimitry Andric _mm256_cmpeq_epi8(__m256i __a, __m256i __b) 6290b57cec5SDimitry Andric { 6300b57cec5SDimitry Andric return (__m256i)((__v32qi)__a == (__v32qi)__b); 6310b57cec5SDimitry Andric } 6320b57cec5SDimitry Andric 63306c3fb27SDimitry Andric /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in 63406c3fb27SDimitry Andric /// \a __a and \a __b for equality and returns the outcomes in the 63506c3fb27SDimitry Andric /// corresponding elements of the 256-bit result. 63606c3fb27SDimitry Andric /// 63706c3fb27SDimitry Andric /// \code{.operation} 63806c3fb27SDimitry Andric /// FOR i := 0 TO 15 63906c3fb27SDimitry Andric /// j := i*16 64006c3fb27SDimitry Andric /// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0 64106c3fb27SDimitry Andric /// ENDFOR 64206c3fb27SDimitry Andric /// \endcode 64306c3fb27SDimitry Andric /// 64406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 64506c3fb27SDimitry Andric /// 64606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQW instruction. 64706c3fb27SDimitry Andric /// 64806c3fb27SDimitry Andric /// \param __a 64906c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the inputs. 65006c3fb27SDimitry Andric /// \param __b 65106c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the inputs. 65206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
6530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 6540b57cec5SDimitry Andric _mm256_cmpeq_epi16(__m256i __a, __m256i __b) 6550b57cec5SDimitry Andric { 6560b57cec5SDimitry Andric return (__m256i)((__v16hi)__a == (__v16hi)__b); 6570b57cec5SDimitry Andric } 6580b57cec5SDimitry Andric 65906c3fb27SDimitry Andric /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in 66006c3fb27SDimitry Andric /// \a __a and \a __b for equality and returns the outcomes in the 66106c3fb27SDimitry Andric /// corresponding elements of the 256-bit result. 66206c3fb27SDimitry Andric /// 66306c3fb27SDimitry Andric /// \code{.operation} 66406c3fb27SDimitry Andric /// FOR i := 0 TO 7 66506c3fb27SDimitry Andric /// j := i*32 66606c3fb27SDimitry Andric /// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0 66706c3fb27SDimitry Andric /// ENDFOR 66806c3fb27SDimitry Andric /// \endcode 66906c3fb27SDimitry Andric /// 67006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 67106c3fb27SDimitry Andric /// 67206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQD instruction. 67306c3fb27SDimitry Andric /// 67406c3fb27SDimitry Andric /// \param __a 67506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the inputs. 67606c3fb27SDimitry Andric /// \param __b 67706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the inputs. 67806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
6790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 6800b57cec5SDimitry Andric _mm256_cmpeq_epi32(__m256i __a, __m256i __b) 6810b57cec5SDimitry Andric { 6820b57cec5SDimitry Andric return (__m256i)((__v8si)__a == (__v8si)__b); 6830b57cec5SDimitry Andric } 6840b57cec5SDimitry Andric 68506c3fb27SDimitry Andric /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in 68606c3fb27SDimitry Andric /// \a __a and \a __b for equality and returns the outcomes in the 68706c3fb27SDimitry Andric /// corresponding elements of the 256-bit result. 68806c3fb27SDimitry Andric /// 68906c3fb27SDimitry Andric /// \code{.operation} 69006c3fb27SDimitry Andric /// FOR i := 0 TO 3 69106c3fb27SDimitry Andric /// j := i*64 69206c3fb27SDimitry Andric /// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0 69306c3fb27SDimitry Andric /// ENDFOR 69406c3fb27SDimitry Andric /// \endcode 69506c3fb27SDimitry Andric /// 69606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 69706c3fb27SDimitry Andric /// 69806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPEQQ instruction. 69906c3fb27SDimitry Andric /// 70006c3fb27SDimitry Andric /// \param __a 70106c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing one of the inputs. 70206c3fb27SDimitry Andric /// \param __b 70306c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing one of the inputs. 70406c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 
7050b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 7060b57cec5SDimitry Andric _mm256_cmpeq_epi64(__m256i __a, __m256i __b) 7070b57cec5SDimitry Andric { 7080b57cec5SDimitry Andric return (__m256i)((__v4di)__a == (__v4di)__b); 7090b57cec5SDimitry Andric } 7100b57cec5SDimitry Andric 71106c3fb27SDimitry Andric /// Compares corresponding signed bytes in the 256-bit integer vectors in 71206c3fb27SDimitry Andric /// \a __a and \a __b for greater-than and returns the outcomes in the 71306c3fb27SDimitry Andric /// corresponding bytes of the 256-bit result. 71406c3fb27SDimitry Andric /// 71506c3fb27SDimitry Andric /// \code{.operation} 71606c3fb27SDimitry Andric /// FOR i := 0 TO 31 71706c3fb27SDimitry Andric /// j := i*8 71806c3fb27SDimitry Andric /// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0 71906c3fb27SDimitry Andric /// ENDFOR 72006c3fb27SDimitry Andric /// \endcode 72106c3fb27SDimitry Andric /// 72206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 72306c3fb27SDimitry Andric /// 72406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTB instruction. 72506c3fb27SDimitry Andric /// 72606c3fb27SDimitry Andric /// \param __a 72706c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the inputs. 72806c3fb27SDimitry Andric /// \param __b 72906c3fb27SDimitry Andric /// A 256-bit integer vector containing one of the inputs. 73006c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 7310b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 7320b57cec5SDimitry Andric _mm256_cmpgt_epi8(__m256i __a, __m256i __b) 7330b57cec5SDimitry Andric { 7340b57cec5SDimitry Andric /* This function always performs a signed comparison, but __v32qi is a char 7350b57cec5SDimitry Andric which may be signed or unsigned, so use __v32qs. 
*/ 7360b57cec5SDimitry Andric return (__m256i)((__v32qs)__a > (__v32qs)__b); 7370b57cec5SDimitry Andric } 7380b57cec5SDimitry Andric 73906c3fb27SDimitry Andric /// Compares corresponding signed elements in the 256-bit vectors of 74006c3fb27SDimitry Andric /// [16 x i16] in \a __a and \a __b for greater-than and returns the 74106c3fb27SDimitry Andric /// outcomes in the corresponding elements of the 256-bit result. 74206c3fb27SDimitry Andric /// 74306c3fb27SDimitry Andric /// \code{.operation} 74406c3fb27SDimitry Andric /// FOR i := 0 TO 15 74506c3fb27SDimitry Andric /// j := i*16 74606c3fb27SDimitry Andric /// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0 74706c3fb27SDimitry Andric /// ENDFOR 74806c3fb27SDimitry Andric /// \endcode 74906c3fb27SDimitry Andric /// 75006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 75106c3fb27SDimitry Andric /// 75206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTW instruction. 75306c3fb27SDimitry Andric /// 75406c3fb27SDimitry Andric /// \param __a 75506c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the inputs. 75606c3fb27SDimitry Andric /// \param __b 75706c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the inputs. 75806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 7590b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 7600b57cec5SDimitry Andric _mm256_cmpgt_epi16(__m256i __a, __m256i __b) 7610b57cec5SDimitry Andric { 7620b57cec5SDimitry Andric return (__m256i)((__v16hi)__a > (__v16hi)__b); 7630b57cec5SDimitry Andric } 7640b57cec5SDimitry Andric 76506c3fb27SDimitry Andric /// Compares corresponding signed elements in the 256-bit vectors of 76606c3fb27SDimitry Andric /// [8 x i32] in \a __a and \a __b for greater-than and returns the 76706c3fb27SDimitry Andric /// outcomes in the corresponding elements of the 256-bit result. 
76806c3fb27SDimitry Andric /// 76906c3fb27SDimitry Andric /// \code{.operation} 77006c3fb27SDimitry Andric /// FOR i := 0 TO 7 77106c3fb27SDimitry Andric /// j := i*32 77206c3fb27SDimitry Andric /// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0 77306c3fb27SDimitry Andric /// ENDFOR 77406c3fb27SDimitry Andric /// \endcode 77506c3fb27SDimitry Andric /// 77606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 77706c3fb27SDimitry Andric /// 77806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTD instruction. 77906c3fb27SDimitry Andric /// 78006c3fb27SDimitry Andric /// \param __a 78106c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the inputs. 78206c3fb27SDimitry Andric /// \param __b 78306c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the inputs. 78406c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 7850b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 7860b57cec5SDimitry Andric _mm256_cmpgt_epi32(__m256i __a, __m256i __b) 7870b57cec5SDimitry Andric { 7880b57cec5SDimitry Andric return (__m256i)((__v8si)__a > (__v8si)__b); 7890b57cec5SDimitry Andric } 7900b57cec5SDimitry Andric 79106c3fb27SDimitry Andric /// Compares corresponding signed elements in the 256-bit vectors of 79206c3fb27SDimitry Andric /// [4 x i64] in \a __a and \a __b for greater-than and returns the 79306c3fb27SDimitry Andric /// outcomes in the corresponding elements of the 256-bit result. 79406c3fb27SDimitry Andric /// 79506c3fb27SDimitry Andric /// \code{.operation} 79606c3fb27SDimitry Andric /// FOR i := 0 TO 3 79706c3fb27SDimitry Andric /// j := i*64 79806c3fb27SDimitry Andric /// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 
0xFFFFFFFFFFFFFFFF : 0 79906c3fb27SDimitry Andric /// ENDFOR 80006c3fb27SDimitry Andric /// \endcode 80106c3fb27SDimitry Andric /// 80206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 80306c3fb27SDimitry Andric /// 80406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPCMPGTQ instruction. 80506c3fb27SDimitry Andric /// 80606c3fb27SDimitry Andric /// \param __a 80706c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing one of the inputs. 80806c3fb27SDimitry Andric /// \param __b 80906c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing one of the inputs. 81006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 8110b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 8120b57cec5SDimitry Andric _mm256_cmpgt_epi64(__m256i __a, __m256i __b) 8130b57cec5SDimitry Andric { 8140b57cec5SDimitry Andric return (__m256i)((__v4di)__a > (__v4di)__b); 8150b57cec5SDimitry Andric } 8160b57cec5SDimitry Andric 81706c3fb27SDimitry Andric /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit 81806c3fb27SDimitry Andric /// vectors of [16 x i16] and returns the lower 16 bits of each sum in an 81906c3fb27SDimitry Andric /// element of the [16 x i16] result (overflow is ignored). Sums from 82006c3fb27SDimitry Andric /// \a __a are returned in the lower 64 bits of each 128-bit half of the 82106c3fb27SDimitry Andric /// result; sums from \a __b are returned in the upper 64 bits of each 82206c3fb27SDimitry Andric /// 128-bit half of the result. 
82306c3fb27SDimitry Andric /// 82406c3fb27SDimitry Andric /// \code{.operation} 82506c3fb27SDimitry Andric /// FOR i := 0 TO 1 82606c3fb27SDimitry Andric /// j := i*128 82706c3fb27SDimitry Andric /// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16] 82806c3fb27SDimitry Andric /// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48] 82906c3fb27SDimitry Andric /// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80] 83006c3fb27SDimitry Andric /// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112] 83106c3fb27SDimitry Andric /// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16] 83206c3fb27SDimitry Andric /// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48] 83306c3fb27SDimitry Andric /// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80] 83406c3fb27SDimitry Andric /// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112] 83506c3fb27SDimitry Andric /// ENDFOR 83606c3fb27SDimitry Andric /// \endcode 83706c3fb27SDimitry Andric /// 83806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 83906c3fb27SDimitry Andric /// 84006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHADDW instruction. 84106c3fb27SDimitry Andric /// 84206c3fb27SDimitry Andric /// \param __a 84306c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 84406c3fb27SDimitry Andric /// \param __b 84506c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 84606c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums. 
8470b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 8480b57cec5SDimitry Andric _mm256_hadd_epi16(__m256i __a, __m256i __b) 8490b57cec5SDimitry Andric { 8500b57cec5SDimitry Andric return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); 8510b57cec5SDimitry Andric } 8520b57cec5SDimitry Andric 85306c3fb27SDimitry Andric /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit 85406c3fb27SDimitry Andric /// vectors of [8 x i32] and returns the lower 32 bits of each sum in an 85506c3fb27SDimitry Andric /// element of the [8 x i32] result (overflow is ignored). Sums from \a __a 85606c3fb27SDimitry Andric /// are returned in the lower 64 bits of each 128-bit half of the result; 85706c3fb27SDimitry Andric /// sums from \a __b are returned in the upper 64 bits of each 128-bit half 85806c3fb27SDimitry Andric /// of the result. 85906c3fb27SDimitry Andric /// 86006c3fb27SDimitry Andric /// \code{.operation} 86106c3fb27SDimitry Andric /// FOR i := 0 TO 1 86206c3fb27SDimitry Andric /// j := i*128 86306c3fb27SDimitry Andric /// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32] 86406c3fb27SDimitry Andric /// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96] 86506c3fb27SDimitry Andric /// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32] 86606c3fb27SDimitry Andric /// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96] 86706c3fb27SDimitry Andric /// ENDFOR 86806c3fb27SDimitry Andric /// \endcode 86906c3fb27SDimitry Andric /// 87006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 87106c3fb27SDimitry Andric /// 87206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHADDD instruction. 87306c3fb27SDimitry Andric /// 87406c3fb27SDimitry Andric /// \param __a 87506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 87606c3fb27SDimitry Andric /// \param __b 87706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 
87806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sums. 8790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 8800b57cec5SDimitry Andric _mm256_hadd_epi32(__m256i __a, __m256i __b) 8810b57cec5SDimitry Andric { 8820b57cec5SDimitry Andric return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); 8830b57cec5SDimitry Andric } 8840b57cec5SDimitry Andric 88506c3fb27SDimitry Andric /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit 88606c3fb27SDimitry Andric /// vectors of [16 x i16] using signed saturation and returns each sum in 88706c3fb27SDimitry Andric /// an element of the [16 x i16] result. Sums from \a __a are returned in 88806c3fb27SDimitry Andric /// the lower 64 bits of each 128-bit half of the result; sums from \a __b 88906c3fb27SDimitry Andric /// are returned in the upper 64 bits of each 128-bit half of the result. 89006c3fb27SDimitry Andric /// 89106c3fb27SDimitry Andric /// \code{.operation} 89206c3fb27SDimitry Andric /// FOR i := 0 TO 1 89306c3fb27SDimitry Andric /// j := i*128 89406c3fb27SDimitry Andric /// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16]) 89506c3fb27SDimitry Andric /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48]) 89606c3fb27SDimitry Andric /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80]) 89706c3fb27SDimitry Andric /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112]) 89806c3fb27SDimitry Andric /// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16]) 89906c3fb27SDimitry Andric /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48]) 90006c3fb27SDimitry Andric /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80]) 90106c3fb27SDimitry Andric /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112]) 90206c3fb27SDimitry Andric /// ENDFOR 90306c3fb27SDimitry Andric /// \endcode 90406c3fb27SDimitry Andric /// 90506c3fb27SDimitry Andric 
/// \headerfile <immintrin.h> 90606c3fb27SDimitry Andric /// 90706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHADDSW instruction. 90806c3fb27SDimitry Andric /// 90906c3fb27SDimitry Andric /// \param __a 91006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 91106c3fb27SDimitry Andric /// \param __b 91206c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 91306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sums. 9140b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 9150b57cec5SDimitry Andric _mm256_hadds_epi16(__m256i __a, __m256i __b) 9160b57cec5SDimitry Andric { 9170b57cec5SDimitry Andric return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); 9180b57cec5SDimitry Andric } 9190b57cec5SDimitry Andric 92006c3fb27SDimitry Andric /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 92106c3fb27SDimitry Andric /// vectors of [16 x i16] and returns the lower 16 bits of each difference 92206c3fb27SDimitry Andric /// in an element of the [16 x i16] result (overflow is ignored). 92306c3fb27SDimitry Andric /// Differences from \a __a are returned in the lower 64 bits of each 92406c3fb27SDimitry Andric /// 128-bit half of the result; differences from \a __b are returned in the 92506c3fb27SDimitry Andric /// upper 64 bits of each 128-bit half of the result. 
92606c3fb27SDimitry Andric /// 92706c3fb27SDimitry Andric /// \code{.operation} 92806c3fb27SDimitry Andric /// FOR i := 0 TO 1 92906c3fb27SDimitry Andric /// j := i*128 93006c3fb27SDimitry Andric /// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16] 93106c3fb27SDimitry Andric /// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48] 93206c3fb27SDimitry Andric /// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80] 93306c3fb27SDimitry Andric /// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112] 93406c3fb27SDimitry Andric /// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16] 93506c3fb27SDimitry Andric /// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48] 93606c3fb27SDimitry Andric /// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80] 93706c3fb27SDimitry Andric /// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112] 93806c3fb27SDimitry Andric /// ENDFOR 93906c3fb27SDimitry Andric /// \endcode 94006c3fb27SDimitry Andric /// 94106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 94206c3fb27SDimitry Andric /// 94306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHSUBW instruction. 94406c3fb27SDimitry Andric /// 94506c3fb27SDimitry Andric /// \param __a 94606c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 94706c3fb27SDimitry Andric /// \param __b 94806c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 94906c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences. 
9500b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 9510b57cec5SDimitry Andric _mm256_hsub_epi16(__m256i __a, __m256i __b) 9520b57cec5SDimitry Andric { 9530b57cec5SDimitry Andric return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); 9540b57cec5SDimitry Andric } 9550b57cec5SDimitry Andric 95606c3fb27SDimitry Andric /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit 95706c3fb27SDimitry Andric /// vectors of [8 x i32] and returns the lower 32 bits of each difference in 95806c3fb27SDimitry Andric /// an element of the [8 x i32] result (overflow is ignored). Differences 95906c3fb27SDimitry Andric /// from \a __a are returned in the lower 64 bits of each 128-bit half of 96006c3fb27SDimitry Andric /// the result; differences from \a __b are returned in the upper 64 bits 96106c3fb27SDimitry Andric /// of each 128-bit half of the result. 96206c3fb27SDimitry Andric /// 96306c3fb27SDimitry Andric /// \code{.operation} 96406c3fb27SDimitry Andric /// FOR i := 0 TO 1 96506c3fb27SDimitry Andric /// j := i*128 96606c3fb27SDimitry Andric /// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32] 96706c3fb27SDimitry Andric /// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96] 96806c3fb27SDimitry Andric /// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32] 96906c3fb27SDimitry Andric /// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96] 97006c3fb27SDimitry Andric /// ENDFOR 97106c3fb27SDimitry Andric /// \endcode 97206c3fb27SDimitry Andric /// 97306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 97406c3fb27SDimitry Andric /// 97506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHSUBD instruction. 97606c3fb27SDimitry Andric /// 97706c3fb27SDimitry Andric /// \param __a 97806c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 
97906c3fb27SDimitry Andric /// \param __b 98006c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 98106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the differences. 9820b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 9830b57cec5SDimitry Andric _mm256_hsub_epi32(__m256i __a, __m256i __b) 9840b57cec5SDimitry Andric { 9850b57cec5SDimitry Andric return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); 9860b57cec5SDimitry Andric } 9870b57cec5SDimitry Andric 98806c3fb27SDimitry Andric /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit 98906c3fb27SDimitry Andric /// vectors of [16 x i16] using signed saturation and returns each sum in 99006c3fb27SDimitry Andric /// an element of the [16 x i16] result. Differences from \a __a are 99106c3fb27SDimitry Andric /// returned in the lower 64 bits of each 128-bit half of the result; 99206c3fb27SDimitry Andric /// differences from \a __b are returned in the upper 64 bits of each 99306c3fb27SDimitry Andric /// 128-bit half of the result. 
99406c3fb27SDimitry Andric /// 99506c3fb27SDimitry Andric /// \code{.operation} 99606c3fb27SDimitry Andric /// FOR i := 0 TO 1 99706c3fb27SDimitry Andric /// j := i*128 99806c3fb27SDimitry Andric /// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16]) 99906c3fb27SDimitry Andric /// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48]) 100006c3fb27SDimitry Andric /// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80]) 100106c3fb27SDimitry Andric /// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112]) 100206c3fb27SDimitry Andric /// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16]) 100306c3fb27SDimitry Andric /// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48]) 100406c3fb27SDimitry Andric /// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80]) 100506c3fb27SDimitry Andric /// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112]) 100606c3fb27SDimitry Andric /// ENDFOR 100706c3fb27SDimitry Andric /// \endcode 100806c3fb27SDimitry Andric /// 100906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 101006c3fb27SDimitry Andric /// 101106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPHSUBSW instruction. 101206c3fb27SDimitry Andric /// 101306c3fb27SDimitry Andric /// \param __a 101406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 101506c3fb27SDimitry Andric /// \param __b 101606c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 101706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences. 
10180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 10190b57cec5SDimitry Andric _mm256_hsubs_epi16(__m256i __a, __m256i __b) 10200b57cec5SDimitry Andric { 10210b57cec5SDimitry Andric return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); 10220b57cec5SDimitry Andric } 10230b57cec5SDimitry Andric 102406c3fb27SDimitry Andric /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a 102506c3fb27SDimitry Andric /// with the corresponding signed byte from the 256-bit integer vector in 102606c3fb27SDimitry Andric /// \a __b, forming signed 16-bit intermediate products. Adds adjacent 102706c3fb27SDimitry Andric /// pairs of those products using signed saturation to form 16-bit sums 102806c3fb27SDimitry Andric /// returned as elements of the [16 x i16] result. 102906c3fb27SDimitry Andric /// 103006c3fb27SDimitry Andric /// \code{.operation} 103106c3fb27SDimitry Andric /// FOR i := 0 TO 15 103206c3fb27SDimitry Andric /// j := i*16 103306c3fb27SDimitry Andric /// temp1 := __a[j+7:j] * __b[j+7:j] 103406c3fb27SDimitry Andric /// temp2 := __a[j+15:j+8] * __b[j+15:j+8] 103506c3fb27SDimitry Andric /// result[j+15:j] := SATURATE16(temp1 + temp2) 103606c3fb27SDimitry Andric /// ENDFOR 103706c3fb27SDimitry Andric /// \endcode 103806c3fb27SDimitry Andric /// 103906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 104006c3fb27SDimitry Andric /// 104106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMADDUBSW instruction. 104206c3fb27SDimitry Andric /// 104306c3fb27SDimitry Andric /// \param __a 104406c3fb27SDimitry Andric /// A 256-bit vector containing one of the source operands. 104506c3fb27SDimitry Andric /// \param __b 104606c3fb27SDimitry Andric /// A 256-bit vector containing one of the source operands. 104706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
10480b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 10490b57cec5SDimitry Andric _mm256_maddubs_epi16(__m256i __a, __m256i __b) 10500b57cec5SDimitry Andric { 10510b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); 10520b57cec5SDimitry Andric } 10530b57cec5SDimitry Andric 105406c3fb27SDimitry Andric /// Multiplies corresponding 16-bit elements of two 256-bit vectors of 105506c3fb27SDimitry Andric /// [16 x i16], forming 32-bit intermediate products, and adds pairs of 105606c3fb27SDimitry Andric /// those products to form 32-bit sums returned as elements of the 105706c3fb27SDimitry Andric /// [8 x i32] result. 105806c3fb27SDimitry Andric /// 105906c3fb27SDimitry Andric /// There is only one wraparound case: when all four of the 16-bit sources 106006c3fb27SDimitry Andric /// are \c 0x8000, the result will be \c 0x80000000. 106106c3fb27SDimitry Andric /// 106206c3fb27SDimitry Andric /// \code{.operation} 106306c3fb27SDimitry Andric /// FOR i := 0 TO 7 106406c3fb27SDimitry Andric /// j := i*32 106506c3fb27SDimitry Andric /// temp1 := __a[j+15:j] * __b[j+15:j] 106606c3fb27SDimitry Andric /// temp2 := __a[j+31:j+16] * __b[j+31:j+16] 106706c3fb27SDimitry Andric /// result[j+31:j] := temp1 + temp2 106806c3fb27SDimitry Andric /// ENDFOR 106906c3fb27SDimitry Andric /// \endcode 107006c3fb27SDimitry Andric /// 107106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 107206c3fb27SDimitry Andric /// 107306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMADDWD instruction. 107406c3fb27SDimitry Andric /// 107506c3fb27SDimitry Andric /// \param __a 107606c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 107706c3fb27SDimitry Andric /// \param __b 107806c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 107906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
10800b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 10810b57cec5SDimitry Andric _mm256_madd_epi16(__m256i __a, __m256i __b) 10820b57cec5SDimitry Andric { 10830b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); 10840b57cec5SDimitry Andric } 10850b57cec5SDimitry Andric 108606c3fb27SDimitry Andric /// Compares the corresponding signed bytes in the two 256-bit integer vectors 108706c3fb27SDimitry Andric /// in \a __a and \a __b and returns the larger of each pair in the 108806c3fb27SDimitry Andric /// corresponding byte of the 256-bit result. 108906c3fb27SDimitry Andric /// 109006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 109106c3fb27SDimitry Andric /// 109206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXSB instruction. 109306c3fb27SDimitry Andric /// 109406c3fb27SDimitry Andric /// \param __a 109506c3fb27SDimitry Andric /// A 256-bit integer vector. 109606c3fb27SDimitry Andric /// \param __b 109706c3fb27SDimitry Andric /// A 256-bit integer vector. 109806c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 10990b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 11000b57cec5SDimitry Andric _mm256_max_epi8(__m256i __a, __m256i __b) 11010b57cec5SDimitry Andric { 110204eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b); 11030b57cec5SDimitry Andric } 11040b57cec5SDimitry Andric 110506c3fb27SDimitry Andric /// Compares the corresponding signed 16-bit integers in the two 256-bit 110606c3fb27SDimitry Andric /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 110706c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 110806c3fb27SDimitry Andric /// 110906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 111006c3fb27SDimitry Andric /// 111106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXSW instruction. 
111206c3fb27SDimitry Andric /// 111306c3fb27SDimitry Andric /// \param __a 111406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 111506c3fb27SDimitry Andric /// \param __b 111606c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 111706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 11180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 11190b57cec5SDimitry Andric _mm256_max_epi16(__m256i __a, __m256i __b) 11200b57cec5SDimitry Andric { 112104eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b); 11220b57cec5SDimitry Andric } 11230b57cec5SDimitry Andric 112406c3fb27SDimitry Andric /// Compares the corresponding signed 32-bit integers in the two 256-bit 112506c3fb27SDimitry Andric /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 112606c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 112706c3fb27SDimitry Andric /// 112806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 112906c3fb27SDimitry Andric /// 113006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXSD instruction. 113106c3fb27SDimitry Andric /// 113206c3fb27SDimitry Andric /// \param __a 113306c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 113406c3fb27SDimitry Andric /// \param __b 113506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 113606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
11370b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 11380b57cec5SDimitry Andric _mm256_max_epi32(__m256i __a, __m256i __b) 11390b57cec5SDimitry Andric { 114004eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b); 11410b57cec5SDimitry Andric } 11420b57cec5SDimitry Andric 114306c3fb27SDimitry Andric /// Compares the corresponding unsigned bytes in the two 256-bit integer 114406c3fb27SDimitry Andric /// vectors in \a __a and \a __b and returns the larger of each pair in 114506c3fb27SDimitry Andric /// the corresponding byte of the 256-bit result. 114606c3fb27SDimitry Andric /// 114706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 114806c3fb27SDimitry Andric /// 114906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXUB instruction. 115006c3fb27SDimitry Andric /// 115106c3fb27SDimitry Andric /// \param __a 115206c3fb27SDimitry Andric /// A 256-bit integer vector. 115306c3fb27SDimitry Andric /// \param __b 115406c3fb27SDimitry Andric /// A 256-bit integer vector. 115506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 11560b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 11570b57cec5SDimitry Andric _mm256_max_epu8(__m256i __a, __m256i __b) 11580b57cec5SDimitry Andric { 115904eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b); 11600b57cec5SDimitry Andric } 11610b57cec5SDimitry Andric 116206c3fb27SDimitry Andric /// Compares the corresponding unsigned 16-bit integers in the two 256-bit 116306c3fb27SDimitry Andric /// vectors of [16 x i16] in \a __a and \a __b and returns the larger of 116406c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 116506c3fb27SDimitry Andric /// 116606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 116706c3fb27SDimitry Andric /// 116806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXUW instruction. 
116906c3fb27SDimitry Andric /// 117006c3fb27SDimitry Andric /// \param __a 117106c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 117206c3fb27SDimitry Andric /// \param __b 117306c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 117406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 11750b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 11760b57cec5SDimitry Andric _mm256_max_epu16(__m256i __a, __m256i __b) 11770b57cec5SDimitry Andric { 117804eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b); 11790b57cec5SDimitry Andric } 11800b57cec5SDimitry Andric 118106c3fb27SDimitry Andric /// Compares the corresponding unsigned 32-bit integers in the two 256-bit 118206c3fb27SDimitry Andric /// vectors of [8 x i32] in \a __a and \a __b and returns the larger of 118306c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 118406c3fb27SDimitry Andric /// 118506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 118606c3fb27SDimitry Andric /// 118706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMAXUD instruction. 118806c3fb27SDimitry Andric /// 118906c3fb27SDimitry Andric /// \param __a 119006c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 119106c3fb27SDimitry Andric /// \param __b 119206c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 119306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
11940b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 11950b57cec5SDimitry Andric _mm256_max_epu32(__m256i __a, __m256i __b) 11960b57cec5SDimitry Andric { 119704eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b); 11980b57cec5SDimitry Andric } 11990b57cec5SDimitry Andric 120006c3fb27SDimitry Andric /// Compares the corresponding signed bytes in the two 256-bit integer vectors 120106c3fb27SDimitry Andric /// in \a __a and \a __b and returns the smaller of each pair in the 120206c3fb27SDimitry Andric /// corresponding byte of the 256-bit result. 120306c3fb27SDimitry Andric /// 120406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 120506c3fb27SDimitry Andric /// 120606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINSB instruction. 120706c3fb27SDimitry Andric /// 120806c3fb27SDimitry Andric /// \param __a 120906c3fb27SDimitry Andric /// A 256-bit integer vector. 121006c3fb27SDimitry Andric /// \param __b 121106c3fb27SDimitry Andric /// A 256-bit integer vector. 121206c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 12130b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 12140b57cec5SDimitry Andric _mm256_min_epi8(__m256i __a, __m256i __b) 12150b57cec5SDimitry Andric { 121604eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b); 12170b57cec5SDimitry Andric } 12180b57cec5SDimitry Andric 121906c3fb27SDimitry Andric /// Compares the corresponding signed 16-bit integers in the two 256-bit 122006c3fb27SDimitry Andric /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 122106c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 122206c3fb27SDimitry Andric /// 122306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 122406c3fb27SDimitry Andric /// 122506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINSW instruction. 
122606c3fb27SDimitry Andric /// 122706c3fb27SDimitry Andric /// \param __a 122806c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 122906c3fb27SDimitry Andric /// \param __b 123006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 123106c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 12320b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 12330b57cec5SDimitry Andric _mm256_min_epi16(__m256i __a, __m256i __b) 12340b57cec5SDimitry Andric { 123504eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b); 12360b57cec5SDimitry Andric } 12370b57cec5SDimitry Andric 123806c3fb27SDimitry Andric /// Compares the corresponding signed 32-bit integers in the two 256-bit 123906c3fb27SDimitry Andric /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 124006c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 124106c3fb27SDimitry Andric /// 124206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 124306c3fb27SDimitry Andric /// 124406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINSD instruction. 124506c3fb27SDimitry Andric /// 124606c3fb27SDimitry Andric /// \param __a 124706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 124806c3fb27SDimitry Andric /// \param __b 124906c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 125006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
12510b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 12520b57cec5SDimitry Andric _mm256_min_epi32(__m256i __a, __m256i __b) 12530b57cec5SDimitry Andric { 125404eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b); 12550b57cec5SDimitry Andric } 12560b57cec5SDimitry Andric 125706c3fb27SDimitry Andric /// Compares the corresponding unsigned bytes in the two 256-bit integer 125806c3fb27SDimitry Andric /// vectors in \a __a and \a __b and returns the smaller of each pair in 125906c3fb27SDimitry Andric /// the corresponding byte of the 256-bit result. 126006c3fb27SDimitry Andric /// 126106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 126206c3fb27SDimitry Andric /// 126306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINUB instruction. 126406c3fb27SDimitry Andric /// 126506c3fb27SDimitry Andric /// \param __a 126606c3fb27SDimitry Andric /// A 256-bit integer vector. 126706c3fb27SDimitry Andric /// \param __b 126806c3fb27SDimitry Andric /// A 256-bit integer vector. 126906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 12700b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 12710b57cec5SDimitry Andric _mm256_min_epu8(__m256i __a, __m256i __b) 12720b57cec5SDimitry Andric { 127304eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b); 12740b57cec5SDimitry Andric } 12750b57cec5SDimitry Andric 127606c3fb27SDimitry Andric /// Compares the corresponding unsigned 16-bit integers in the two 256-bit 127706c3fb27SDimitry Andric /// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of 127806c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 127906c3fb27SDimitry Andric /// 128006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 128106c3fb27SDimitry Andric /// 128206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINUW instruction. 
128306c3fb27SDimitry Andric /// 128406c3fb27SDimitry Andric /// \param __a 128506c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 128606c3fb27SDimitry Andric /// \param __b 128706c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 128806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 12890b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 12900b57cec5SDimitry Andric _mm256_min_epu16(__m256i __a, __m256i __b) 12910b57cec5SDimitry Andric { 129204eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b); 12930b57cec5SDimitry Andric } 12940b57cec5SDimitry Andric 129506c3fb27SDimitry Andric /// Compares the corresponding unsigned 32-bit integers in the two 256-bit 129606c3fb27SDimitry Andric /// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of 129706c3fb27SDimitry Andric /// each pair in the corresponding element of the 256-bit result. 129806c3fb27SDimitry Andric /// 129906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 130006c3fb27SDimitry Andric /// 130106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMINUD instruction. 130206c3fb27SDimitry Andric /// 130306c3fb27SDimitry Andric /// \param __a 130406c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 130506c3fb27SDimitry Andric /// \param __b 130606c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 130706c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
13080b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 13090b57cec5SDimitry Andric _mm256_min_epu32(__m256i __a, __m256i __b) 13100b57cec5SDimitry Andric { 131104eeddc0SDimitry Andric return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b); 13120b57cec5SDimitry Andric } 13130b57cec5SDimitry Andric 1314*5f757f3fSDimitry Andric /// Creates a 32-bit integer mask from the most significant bit of each byte 1315*5f757f3fSDimitry Andric /// in the 256-bit integer vector in \a __a and returns the result. 1316*5f757f3fSDimitry Andric /// 1317*5f757f3fSDimitry Andric /// \code{.operation} 1318*5f757f3fSDimitry Andric /// FOR i := 0 TO 31 1319*5f757f3fSDimitry Andric /// j := i*8 1320*5f757f3fSDimitry Andric /// result[i] := __a[j+7] 1321*5f757f3fSDimitry Andric /// ENDFOR 1322*5f757f3fSDimitry Andric /// \endcode 1323*5f757f3fSDimitry Andric /// 1324*5f757f3fSDimitry Andric /// \headerfile <immintrin.h> 1325*5f757f3fSDimitry Andric /// 1326*5f757f3fSDimitry Andric /// This intrinsic corresponds to the \c VPMOVMSKB instruction. 1327*5f757f3fSDimitry Andric /// 1328*5f757f3fSDimitry Andric /// \param __a 1329*5f757f3fSDimitry Andric /// A 256-bit integer vector containing the source bytes. 1330*5f757f3fSDimitry Andric /// \returns The 32-bit integer mask. 13310b57cec5SDimitry Andric static __inline__ int __DEFAULT_FN_ATTRS256 13320b57cec5SDimitry Andric _mm256_movemask_epi8(__m256i __a) 13330b57cec5SDimitry Andric { 13340b57cec5SDimitry Andric return __builtin_ia32_pmovmskb256((__v32qi)__a); 13350b57cec5SDimitry Andric } 13360b57cec5SDimitry Andric 133706c3fb27SDimitry Andric /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns 133806c3fb27SDimitry Andric /// the 16-bit values in the corresponding elements of a 256-bit vector 133906c3fb27SDimitry Andric /// of [16 x i16]. 
134006c3fb27SDimitry Andric /// 134106c3fb27SDimitry Andric /// \code{.operation} 134206c3fb27SDimitry Andric /// FOR i := 0 TO 15 134306c3fb27SDimitry Andric /// j := i*8 134406c3fb27SDimitry Andric /// k := i*16 134506c3fb27SDimitry Andric /// result[k+15:k] := SignExtend(__V[j+7:j]) 134606c3fb27SDimitry Andric /// ENDFOR 134706c3fb27SDimitry Andric /// \endcode 134806c3fb27SDimitry Andric /// 134906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 135006c3fb27SDimitry Andric /// 135106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXBW instruction. 135206c3fb27SDimitry Andric /// 135306c3fb27SDimitry Andric /// \param __V 135406c3fb27SDimitry Andric /// A 128-bit integer vector containing the source bytes. 135506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the sign-extended 135606c3fb27SDimitry Andric /// values. 13570b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 13580b57cec5SDimitry Andric _mm256_cvtepi8_epi16(__m128i __V) 13590b57cec5SDimitry Andric { 13600b57cec5SDimitry Andric /* This function always performs a signed extension, but __v16qi is a char 13610b57cec5SDimitry Andric which may be signed or unsigned, so use __v16qs. */ 13620b57cec5SDimitry Andric return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); 13630b57cec5SDimitry Andric } 13640b57cec5SDimitry Andric 136506c3fb27SDimitry Andric /// Sign-extends bytes from the lower half of the 128-bit integer vector in 136606c3fb27SDimitry Andric /// \a __V and returns the 32-bit values in the corresponding elements of a 136706c3fb27SDimitry Andric /// 256-bit vector of [8 x i32]. 
136806c3fb27SDimitry Andric /// 136906c3fb27SDimitry Andric /// \code{.operation} 137006c3fb27SDimitry Andric /// FOR i := 0 TO 7 137106c3fb27SDimitry Andric /// j := i*8 137206c3fb27SDimitry Andric /// k := i*32 137306c3fb27SDimitry Andric /// result[k+31:k] := SignExtend(__V[j+7:j]) 137406c3fb27SDimitry Andric /// ENDFOR 137506c3fb27SDimitry Andric /// \endcode 137606c3fb27SDimitry Andric /// 137706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 137806c3fb27SDimitry Andric /// 137906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXBD instruction. 138006c3fb27SDimitry Andric /// 138106c3fb27SDimitry Andric /// \param __V 138206c3fb27SDimitry Andric /// A 128-bit integer vector containing the source bytes. 138306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sign-extended 138406c3fb27SDimitry Andric /// values. 13850b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 13860b57cec5SDimitry Andric _mm256_cvtepi8_epi32(__m128i __V) 13870b57cec5SDimitry Andric { 13880b57cec5SDimitry Andric /* This function always performs a signed extension, but __v16qi is a char 13890b57cec5SDimitry Andric which may be signed or unsigned, so use __v16qs. */ 13900b57cec5SDimitry Andric return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 13910b57cec5SDimitry Andric } 13920b57cec5SDimitry Andric 139306c3fb27SDimitry Andric /// Sign-extends the first four bytes from the 128-bit integer vector in 139406c3fb27SDimitry Andric /// \a __V and returns the 64-bit values in the corresponding elements of a 139506c3fb27SDimitry Andric /// 256-bit vector of [4 x i64]. 
139606c3fb27SDimitry Andric /// 139706c3fb27SDimitry Andric /// \code{.operation} 139806c3fb27SDimitry Andric /// result[63:0] := SignExtend(__V[7:0]) 139906c3fb27SDimitry Andric /// result[127:64] := SignExtend(__V[15:8]) 140006c3fb27SDimitry Andric /// result[191:128] := SignExtend(__V[23:16]) 140106c3fb27SDimitry Andric /// result[255:192] := SignExtend(__V[31:24]) 140206c3fb27SDimitry Andric /// \endcode 140306c3fb27SDimitry Andric /// 140406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 140506c3fb27SDimitry Andric /// 140606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXBQ instruction. 140706c3fb27SDimitry Andric /// 140806c3fb27SDimitry Andric /// \param __V 140906c3fb27SDimitry Andric /// A 128-bit integer vector containing the source bytes. 141006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 141106c3fb27SDimitry Andric /// values. 14120b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 14130b57cec5SDimitry Andric _mm256_cvtepi8_epi64(__m128i __V) 14140b57cec5SDimitry Andric { 14150b57cec5SDimitry Andric /* This function always performs a signed extension, but __v16qi is a char 14160b57cec5SDimitry Andric which may be signed or unsigned, so use __v16qs. */ 14170b57cec5SDimitry Andric return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); 14180b57cec5SDimitry Andric } 14190b57cec5SDimitry Andric 142006c3fb27SDimitry Andric /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in 142106c3fb27SDimitry Andric /// \a __V and returns the 32-bit values in the corresponding elements of a 142206c3fb27SDimitry Andric /// 256-bit vector of [8 x i32]. 
142306c3fb27SDimitry Andric /// 142406c3fb27SDimitry Andric /// \code{.operation} 142506c3fb27SDimitry Andric /// FOR i := 0 TO 7 142606c3fb27SDimitry Andric /// j := i*16 142706c3fb27SDimitry Andric /// k := i*32 142806c3fb27SDimitry Andric /// result[k+31:k] := SignExtend(__V[j+15:j]) 142906c3fb27SDimitry Andric /// ENDFOR 143006c3fb27SDimitry Andric /// \endcode 143106c3fb27SDimitry Andric /// 143206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 143306c3fb27SDimitry Andric /// 143406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXWD instruction. 143506c3fb27SDimitry Andric /// 143606c3fb27SDimitry Andric /// \param __V 143706c3fb27SDimitry Andric /// A 128-bit vector of [8 x i16] containing the source values. 143806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the sign-extended 143906c3fb27SDimitry Andric /// values. 14400b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 14410b57cec5SDimitry Andric _mm256_cvtepi16_epi32(__m128i __V) 14420b57cec5SDimitry Andric { 14430b57cec5SDimitry Andric return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); 14440b57cec5SDimitry Andric } 14450b57cec5SDimitry Andric 144606c3fb27SDimitry Andric /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of 144706c3fb27SDimitry Andric /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 144806c3fb27SDimitry Andric /// elements of a 256-bit vector of [4 x i64]. 
144906c3fb27SDimitry Andric /// 145006c3fb27SDimitry Andric /// \code{.operation} 145106c3fb27SDimitry Andric /// result[63:0] := SignExtend(__V[15:0]) 145206c3fb27SDimitry Andric /// result[127:64] := SignExtend(__V[31:16]) 145306c3fb27SDimitry Andric /// result[191:128] := SignExtend(__V[47:32]) 145406c3fb27SDimitry Andric /// result[255:192] := SignExtend(__V[64:48]) 145506c3fb27SDimitry Andric /// \endcode 145606c3fb27SDimitry Andric /// 145706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 145806c3fb27SDimitry Andric /// 145906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXWQ instruction. 146006c3fb27SDimitry Andric /// 146106c3fb27SDimitry Andric /// \param __V 146206c3fb27SDimitry Andric /// A 128-bit vector of [8 x i16] containing the source values. 146306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 146406c3fb27SDimitry Andric /// values. 14650b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 14660b57cec5SDimitry Andric _mm256_cvtepi16_epi64(__m128i __V) 14670b57cec5SDimitry Andric { 14680b57cec5SDimitry Andric return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); 14690b57cec5SDimitry Andric } 14700b57cec5SDimitry Andric 147106c3fb27SDimitry Andric /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in 147206c3fb27SDimitry Andric /// \a __V and returns the 64-bit values in the corresponding elements of a 147306c3fb27SDimitry Andric /// 256-bit vector of [4 x i64]. 
147406c3fb27SDimitry Andric /// 147506c3fb27SDimitry Andric /// \code{.operation} 147606c3fb27SDimitry Andric /// result[63:0] := SignExtend(__V[31:0]) 147706c3fb27SDimitry Andric /// result[127:64] := SignExtend(__V[63:32]) 147806c3fb27SDimitry Andric /// result[191:128] := SignExtend(__V[95:64]) 147906c3fb27SDimitry Andric /// result[255:192] := SignExtend(__V[127:96]) 148006c3fb27SDimitry Andric /// \endcode 148106c3fb27SDimitry Andric /// 148206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 148306c3fb27SDimitry Andric /// 148406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVSXDQ instruction. 148506c3fb27SDimitry Andric /// 148606c3fb27SDimitry Andric /// \param __V 148706c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the source values. 148806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the sign-extended 148906c3fb27SDimitry Andric /// values. 14900b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 14910b57cec5SDimitry Andric _mm256_cvtepi32_epi64(__m128i __V) 14920b57cec5SDimitry Andric { 14930b57cec5SDimitry Andric return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); 14940b57cec5SDimitry Andric } 14950b57cec5SDimitry Andric 149606c3fb27SDimitry Andric /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns 149706c3fb27SDimitry Andric /// the 16-bit values in the corresponding elements of a 256-bit vector 149806c3fb27SDimitry Andric /// of [16 x i16]. 
149906c3fb27SDimitry Andric /// 150006c3fb27SDimitry Andric /// \code{.operation} 150106c3fb27SDimitry Andric /// FOR i := 0 TO 15 150206c3fb27SDimitry Andric /// j := i*8 150306c3fb27SDimitry Andric /// k := i*16 150406c3fb27SDimitry Andric /// result[k+15:k] := ZeroExtend(__V[j+7:j]) 150506c3fb27SDimitry Andric /// ENDFOR 150606c3fb27SDimitry Andric /// \endcode 150706c3fb27SDimitry Andric /// 150806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 150906c3fb27SDimitry Andric /// 151006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXBW instruction. 151106c3fb27SDimitry Andric /// 151206c3fb27SDimitry Andric /// \param __V 151306c3fb27SDimitry Andric /// A 128-bit integer vector containing the source bytes. 151406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the zero-extended 151506c3fb27SDimitry Andric /// values. 15160b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 15170b57cec5SDimitry Andric _mm256_cvtepu8_epi16(__m128i __V) 15180b57cec5SDimitry Andric { 15190b57cec5SDimitry Andric return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); 15200b57cec5SDimitry Andric } 15210b57cec5SDimitry Andric 152206c3fb27SDimitry Andric /// Zero-extends bytes from the lower half of the 128-bit integer vector in 152306c3fb27SDimitry Andric /// \a __V and returns the 32-bit values in the corresponding elements of a 152406c3fb27SDimitry Andric /// 256-bit vector of [8 x i32]. 
152506c3fb27SDimitry Andric /// 152606c3fb27SDimitry Andric /// \code{.operation} 152706c3fb27SDimitry Andric /// FOR i := 0 TO 7 152806c3fb27SDimitry Andric /// j := i*8 152906c3fb27SDimitry Andric /// k := i*32 153006c3fb27SDimitry Andric /// result[k+31:k] := ZeroExtend(__V[j+7:j]) 153106c3fb27SDimitry Andric /// ENDFOR 153206c3fb27SDimitry Andric /// \endcode 153306c3fb27SDimitry Andric /// 153406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 153506c3fb27SDimitry Andric /// 153606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXBD instruction. 153706c3fb27SDimitry Andric /// 153806c3fb27SDimitry Andric /// \param __V 153906c3fb27SDimitry Andric /// A 128-bit integer vector containing the source bytes. 154006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the zero-extended 154106c3fb27SDimitry Andric /// values. 15420b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 15430b57cec5SDimitry Andric _mm256_cvtepu8_epi32(__m128i __V) 15440b57cec5SDimitry Andric { 15450b57cec5SDimitry Andric return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); 15460b57cec5SDimitry Andric } 15470b57cec5SDimitry Andric 154806c3fb27SDimitry Andric /// Zero-extends the first four bytes from the 128-bit integer vector in 154906c3fb27SDimitry Andric /// \a __V and returns the 64-bit values in the corresponding elements of a 155006c3fb27SDimitry Andric /// 256-bit vector of [4 x i64]. 
155106c3fb27SDimitry Andric /// 155206c3fb27SDimitry Andric /// \code{.operation} 155306c3fb27SDimitry Andric /// result[63:0] := ZeroExtend(__V[7:0]) 155406c3fb27SDimitry Andric /// result[127:64] := ZeroExtend(__V[15:8]) 155506c3fb27SDimitry Andric /// result[191:128] := ZeroExtend(__V[23:16]) 155606c3fb27SDimitry Andric /// result[255:192] := ZeroExtend(__V[31:24]) 155706c3fb27SDimitry Andric /// \endcode 155806c3fb27SDimitry Andric /// 155906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 156006c3fb27SDimitry Andric /// 156106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXBQ instruction. 156206c3fb27SDimitry Andric /// 156306c3fb27SDimitry Andric /// \param __V 156406c3fb27SDimitry Andric /// A 128-bit integer vector containing the source bytes. 156506c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 156606c3fb27SDimitry Andric /// values. 15670b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 15680b57cec5SDimitry Andric _mm256_cvtepu8_epi64(__m128i __V) 15690b57cec5SDimitry Andric { 15700b57cec5SDimitry Andric return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); 15710b57cec5SDimitry Andric } 15720b57cec5SDimitry Andric 157306c3fb27SDimitry Andric /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in 157406c3fb27SDimitry Andric /// \a __V and returns the 32-bit values in the corresponding elements of a 157506c3fb27SDimitry Andric /// 256-bit vector of [8 x i32]. 
157606c3fb27SDimitry Andric /// 157706c3fb27SDimitry Andric /// \code{.operation} 157806c3fb27SDimitry Andric /// FOR i := 0 TO 7 157906c3fb27SDimitry Andric /// j := i*16 158006c3fb27SDimitry Andric /// k := i*32 158106c3fb27SDimitry Andric /// result[k+31:k] := ZeroExtend(__V[j+15:j]) 158206c3fb27SDimitry Andric /// ENDFOR 158306c3fb27SDimitry Andric /// \endcode 158406c3fb27SDimitry Andric /// 158506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 158606c3fb27SDimitry Andric /// 158706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXWD instruction. 158806c3fb27SDimitry Andric /// 158906c3fb27SDimitry Andric /// \param __V 159006c3fb27SDimitry Andric /// A 128-bit vector of [8 x i16] containing the source values. 159106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the zero-extended 159206c3fb27SDimitry Andric /// values. 15930b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 15940b57cec5SDimitry Andric _mm256_cvtepu16_epi32(__m128i __V) 15950b57cec5SDimitry Andric { 15960b57cec5SDimitry Andric return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); 15970b57cec5SDimitry Andric } 15980b57cec5SDimitry Andric 159906c3fb27SDimitry Andric /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of 160006c3fb27SDimitry Andric /// [8 x i16] in \a __V and returns the 64-bit values in the corresponding 160106c3fb27SDimitry Andric /// elements of a 256-bit vector of [4 x i64]. 
160206c3fb27SDimitry Andric /// 160306c3fb27SDimitry Andric /// \code{.operation} 160406c3fb27SDimitry Andric /// result[63:0] := ZeroExtend(__V[15:0]) 160506c3fb27SDimitry Andric /// result[127:64] := ZeroExtend(__V[31:16]) 160606c3fb27SDimitry Andric /// result[191:128] := ZeroExtend(__V[47:32]) 160706c3fb27SDimitry Andric /// result[255:192] := ZeroExtend(__V[63:48]) 160806c3fb27SDimitry Andric /// \endcode 160906c3fb27SDimitry Andric /// 161006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 161106c3fb27SDimitry Andric /// 161206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXWQ instruction. 161306c3fb27SDimitry Andric /// 161406c3fb27SDimitry Andric /// \param __V 161506c3fb27SDimitry Andric /// A 128-bit vector of [8 x i16] containing the source values. 161606c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 161706c3fb27SDimitry Andric /// values. 16180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 16190b57cec5SDimitry Andric _mm256_cvtepu16_epi64(__m128i __V) 16200b57cec5SDimitry Andric { 16210b57cec5SDimitry Andric return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); 16220b57cec5SDimitry Andric } 16230b57cec5SDimitry Andric 162406c3fb27SDimitry Andric /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in 162506c3fb27SDimitry Andric /// \a __V and returns the 64-bit values in the corresponding elements of a 162606c3fb27SDimitry Andric /// 256-bit vector of [4 x i64]. 
162706c3fb27SDimitry Andric /// 162806c3fb27SDimitry Andric /// \code{.operation} 162906c3fb27SDimitry Andric /// result[63:0] := ZeroExtend(__V[31:0]) 163006c3fb27SDimitry Andric /// result[127:64] := ZeroExtend(__V[63:32]) 163106c3fb27SDimitry Andric /// result[191:128] := ZeroExtend(__V[95:64]) 163206c3fb27SDimitry Andric /// result[255:192] := ZeroExtend(__V[127:96]) 163306c3fb27SDimitry Andric /// \endcode 163406c3fb27SDimitry Andric /// 163506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 163606c3fb27SDimitry Andric /// 163706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMOVZXDQ instruction. 163806c3fb27SDimitry Andric /// 163906c3fb27SDimitry Andric /// \param __V 164006c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the source values. 164106c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the zero-extended 164206c3fb27SDimitry Andric /// values. 16430b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 16440b57cec5SDimitry Andric _mm256_cvtepu32_epi64(__m128i __V) 16450b57cec5SDimitry Andric { 16460b57cec5SDimitry Andric return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); 16470b57cec5SDimitry Andric } 16480b57cec5SDimitry Andric 164906c3fb27SDimitry Andric /// Multiplies signed 32-bit integers from even-numbered elements of two 165006c3fb27SDimitry Andric /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 165106c3fb27SDimitry Andric /// [4 x i64] result. 
165206c3fb27SDimitry Andric /// 165306c3fb27SDimitry Andric /// \code{.operation} 165406c3fb27SDimitry Andric /// result[63:0] := __a[31:0] * __b[31:0] 165506c3fb27SDimitry Andric /// result[127:64] := __a[95:64] * __b[95:64] 165606c3fb27SDimitry Andric /// result[191:128] := __a[159:128] * __b[159:128] 165706c3fb27SDimitry Andric /// result[255:192] := __a[223:192] * __b[223:192] 165806c3fb27SDimitry Andric /// \endcode 165906c3fb27SDimitry Andric /// 166006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 166106c3fb27SDimitry Andric /// 166206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULDQ instruction. 166306c3fb27SDimitry Andric /// 166406c3fb27SDimitry Andric /// \param __a 166506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 166606c3fb27SDimitry Andric /// \param __b 166706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 166806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the products. 16690b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 16700b57cec5SDimitry Andric _mm256_mul_epi32(__m256i __a, __m256i __b) 16710b57cec5SDimitry Andric { 16720b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); 16730b57cec5SDimitry Andric } 16740b57cec5SDimitry Andric 167506c3fb27SDimitry Andric /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 167606c3fb27SDimitry Andric /// [16 x i16], truncates the 32-bit results to the most significant 18 167706c3fb27SDimitry Andric /// bits, rounds by adding 1, and returns bits [16:1] of each rounded 167806c3fb27SDimitry Andric /// product in the [16 x i16] result. 
167906c3fb27SDimitry Andric /// 168006c3fb27SDimitry Andric /// \code{.operation} 168106c3fb27SDimitry Andric /// FOR i := 0 TO 15 168206c3fb27SDimitry Andric /// j := i*16 168306c3fb27SDimitry Andric /// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1 168406c3fb27SDimitry Andric /// result[j+15:j] := temp[16:1] /// ENDFOR 168506c3fb27SDimitry Andric /// \endcode 168606c3fb27SDimitry Andric /// 168706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 168806c3fb27SDimitry Andric /// 168906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULHRSW instruction. 169006c3fb27SDimitry Andric /// 169106c3fb27SDimitry Andric /// \param __a 169206c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 169306c3fb27SDimitry Andric /// \param __b 169406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 169506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the rounded products. 16960b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 16970b57cec5SDimitry Andric _mm256_mulhrs_epi16(__m256i __a, __m256i __b) 16980b57cec5SDimitry Andric { 16990b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); 17000b57cec5SDimitry Andric } 17010b57cec5SDimitry Andric 170206c3fb27SDimitry Andric /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of 170306c3fb27SDimitry Andric /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 170406c3fb27SDimitry Andric /// [16 x i16] result. 170506c3fb27SDimitry Andric /// 170606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 170706c3fb27SDimitry Andric /// 170806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULHUW instruction. 170906c3fb27SDimitry Andric /// 171006c3fb27SDimitry Andric /// \param __a 171106c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 
171206c3fb27SDimitry Andric /// \param __b 171306c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 171406c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the products. 17150b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 17160b57cec5SDimitry Andric _mm256_mulhi_epu16(__m256i __a, __m256i __b) 17170b57cec5SDimitry Andric { 17180b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); 17190b57cec5SDimitry Andric } 17200b57cec5SDimitry Andric 172106c3fb27SDimitry Andric /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 172206c3fb27SDimitry Andric /// [16 x i16], and returns the upper 16 bits of each 32-bit product in the 172306c3fb27SDimitry Andric /// [16 x i16] result. 172406c3fb27SDimitry Andric /// 172506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 172606c3fb27SDimitry Andric /// 172706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULHW instruction. 172806c3fb27SDimitry Andric /// 172906c3fb27SDimitry Andric /// \param __a 173006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 173106c3fb27SDimitry Andric /// \param __b 173206c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 173306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the products. 
17340b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 17350b57cec5SDimitry Andric _mm256_mulhi_epi16(__m256i __a, __m256i __b) 17360b57cec5SDimitry Andric { 17370b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); 17380b57cec5SDimitry Andric } 17390b57cec5SDimitry Andric 174006c3fb27SDimitry Andric /// Multiplies signed 16-bit integer elements of two 256-bit vectors of 174106c3fb27SDimitry Andric /// [16 x i16], and returns the lower 16 bits of each 32-bit product in the 174206c3fb27SDimitry Andric /// [16 x i16] result. 174306c3fb27SDimitry Andric /// 174406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 174506c3fb27SDimitry Andric /// 174606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULLW instruction. 174706c3fb27SDimitry Andric /// 174806c3fb27SDimitry Andric /// \param __a 174906c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 175006c3fb27SDimitry Andric /// \param __b 175106c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing one of the source operands. 175206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the products. 17530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 17540b57cec5SDimitry Andric _mm256_mullo_epi16(__m256i __a, __m256i __b) 17550b57cec5SDimitry Andric { 17560b57cec5SDimitry Andric return (__m256i)((__v16hu)__a * (__v16hu)__b); 17570b57cec5SDimitry Andric } 17580b57cec5SDimitry Andric 175906c3fb27SDimitry Andric /// Multiplies signed 32-bit integer elements of two 256-bit vectors of 176006c3fb27SDimitry Andric /// [8 x i32], and returns the lower 32 bits of each 64-bit product in the 176106c3fb27SDimitry Andric /// [8 x i32] result. 176206c3fb27SDimitry Andric /// 176306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 176406c3fb27SDimitry Andric /// 176506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULLD instruction. 
176606c3fb27SDimitry Andric /// 176706c3fb27SDimitry Andric /// \param __a 176806c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 176906c3fb27SDimitry Andric /// \param __b 177006c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 177106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the products. 17720b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 17730b57cec5SDimitry Andric _mm256_mullo_epi32 (__m256i __a, __m256i __b) 17740b57cec5SDimitry Andric { 17750b57cec5SDimitry Andric return (__m256i)((__v8su)__a * (__v8su)__b); 17760b57cec5SDimitry Andric } 17770b57cec5SDimitry Andric 177806c3fb27SDimitry Andric /// Multiplies unsigned 32-bit integers from even-numbered elements of two 177906c3fb27SDimitry Andric /// 256-bit vectors of [8 x i32] and returns the 64-bit products in the 178006c3fb27SDimitry Andric /// [4 x i64] result. 178106c3fb27SDimitry Andric /// 178206c3fb27SDimitry Andric /// \code{.operation} 178306c3fb27SDimitry Andric /// result[63:0] := __a[31:0] * __b[31:0] 178406c3fb27SDimitry Andric /// result[127:64] := __a[95:64] * __b[95:64] 178506c3fb27SDimitry Andric /// result[191:128] := __a[159:128] * __b[159:128] 178606c3fb27SDimitry Andric /// result[255:192] := __a[223:192] * __b[223:192] 178706c3fb27SDimitry Andric /// \endcode 178806c3fb27SDimitry Andric /// 178906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 179006c3fb27SDimitry Andric /// 179106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMULUDQ instruction. 179206c3fb27SDimitry Andric /// 179306c3fb27SDimitry Andric /// \param __a 179406c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 179506c3fb27SDimitry Andric /// \param __b 179606c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing one of the source operands. 
179706c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the products. 17980b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 17990b57cec5SDimitry Andric _mm256_mul_epu32(__m256i __a, __m256i __b) 18000b57cec5SDimitry Andric { 18010b57cec5SDimitry Andric return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); 18020b57cec5SDimitry Andric } 18030b57cec5SDimitry Andric 180406c3fb27SDimitry Andric /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and 180506c3fb27SDimitry Andric /// \a __b. 180606c3fb27SDimitry Andric /// 180706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 180806c3fb27SDimitry Andric /// 180906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPOR instruction. 181006c3fb27SDimitry Andric /// 181106c3fb27SDimitry Andric /// \param __a 181206c3fb27SDimitry Andric /// A 256-bit integer vector. 181306c3fb27SDimitry Andric /// \param __b 181406c3fb27SDimitry Andric /// A 256-bit integer vector. 181506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 18160b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 18170b57cec5SDimitry Andric _mm256_or_si256(__m256i __a, __m256i __b) 18180b57cec5SDimitry Andric { 18190b57cec5SDimitry Andric return (__m256i)((__v4du)__a | (__v4du)__b); 18200b57cec5SDimitry Andric } 18210b57cec5SDimitry Andric 182206c3fb27SDimitry Andric /// Computes four sum of absolute difference (SAD) operations on sets of eight 182306c3fb27SDimitry Andric /// unsigned 8-bit integers from the 256-bit integer vectors \a __a and 182406c3fb27SDimitry Andric /// \a __b. 182506c3fb27SDimitry Andric /// 182606c3fb27SDimitry Andric /// One SAD result is computed for each set of eight bytes from \a __a and 182706c3fb27SDimitry Andric /// eight bytes from \a __b. The zero-extended SAD value is returned in the 182806c3fb27SDimitry Andric /// corresponding 64-bit element of the result. 
182906c3fb27SDimitry Andric /// 183006c3fb27SDimitry Andric /// A single SAD operation takes the differences between the corresponding 183106c3fb27SDimitry Andric /// bytes of \a __a and \a __b, takes the absolute value of each difference, 183206c3fb27SDimitry Andric /// and sums these eight values to form one 16-bit result. This operation 183306c3fb27SDimitry Andric /// is repeated four times with successive sets of eight bytes. 183406c3fb27SDimitry Andric /// 183506c3fb27SDimitry Andric /// \code{.operation} 183606c3fb27SDimitry Andric /// FOR i := 0 TO 3 183706c3fb27SDimitry Andric /// j := i*64 183806c3fb27SDimitry Andric /// temp0 := ABS(__a[j+7:j] - __b[j+7:j]) 183906c3fb27SDimitry Andric /// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8]) 184006c3fb27SDimitry Andric /// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16]) 184106c3fb27SDimitry Andric /// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24]) 184206c3fb27SDimitry Andric /// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32]) 184306c3fb27SDimitry Andric /// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40]) 184406c3fb27SDimitry Andric /// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48]) 184506c3fb27SDimitry Andric /// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56]) 184606c3fb27SDimitry Andric /// result[j+15:j] := temp0 + temp1 + temp2 + temp3 + 184706c3fb27SDimitry Andric /// temp4 + temp5 + temp6 + temp7 184806c3fb27SDimitry Andric /// result[j+63:j+16] := 0 184906c3fb27SDimitry Andric /// ENDFOR 185006c3fb27SDimitry Andric /// \endcode 185106c3fb27SDimitry Andric /// 185206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 185306c3fb27SDimitry Andric /// 185406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSADBW instruction. 185506c3fb27SDimitry Andric /// 185606c3fb27SDimitry Andric /// \param __a 185706c3fb27SDimitry Andric /// A 256-bit integer vector. 185806c3fb27SDimitry Andric /// \param __b 185906c3fb27SDimitry Andric /// A 256-bit integer vector. 
186006c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 18610b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 18620b57cec5SDimitry Andric _mm256_sad_epu8(__m256i __a, __m256i __b) 18630b57cec5SDimitry Andric { 18640b57cec5SDimitry Andric return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); 18650b57cec5SDimitry Andric } 18660b57cec5SDimitry Andric 186706c3fb27SDimitry Andric /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according 186806c3fb27SDimitry Andric /// to control information in the 256-bit integer vector \a __b, and 186906c3fb27SDimitry Andric /// returns the 256-bit result. In effect there are two separate 128-bit 187006c3fb27SDimitry Andric /// shuffles in the lower and upper halves. 187106c3fb27SDimitry Andric /// 187206c3fb27SDimitry Andric /// \code{.operation} 187306c3fb27SDimitry Andric /// FOR i := 0 TO 31 187406c3fb27SDimitry Andric /// j := i*8 187506c3fb27SDimitry Andric /// IF __b[j+7] == 1 187606c3fb27SDimitry Andric /// result[j+7:j] := 0 187706c3fb27SDimitry Andric /// ELSE 187806c3fb27SDimitry Andric /// k := __b[j+3:j] * 8 187906c3fb27SDimitry Andric /// IF i > 15 188006c3fb27SDimitry Andric /// k := k + 128 188106c3fb27SDimitry Andric /// FI 188206c3fb27SDimitry Andric /// result[j+7:j] := __a[k+7:k] 188306c3fb27SDimitry Andric /// FI 188406c3fb27SDimitry Andric /// ENDFOR 188506c3fb27SDimitry Andric /// \endcode 188606c3fb27SDimitry Andric /// 188706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 188806c3fb27SDimitry Andric /// 188906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSHUFB instruction. 189006c3fb27SDimitry Andric /// 189106c3fb27SDimitry Andric /// \param __a 189206c3fb27SDimitry Andric /// A 256-bit integer vector containing source values. 
189306c3fb27SDimitry Andric /// \param __b 189406c3fb27SDimitry Andric /// A 256-bit integer vector containing control information to determine 189506c3fb27SDimitry Andric /// what goes into the corresponding byte of the result. If bit 7 of the 189606c3fb27SDimitry Andric /// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the 189706c3fb27SDimitry Andric /// control byte specify the index (within the same 128-bit half) of \a __a 189806c3fb27SDimitry Andric /// to copy to the result byte. 189906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 19000b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 19010b57cec5SDimitry Andric _mm256_shuffle_epi8(__m256i __a, __m256i __b) 19020b57cec5SDimitry Andric { 19030b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); 19040b57cec5SDimitry Andric } 19050b57cec5SDimitry Andric 190606c3fb27SDimitry Andric /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a 190706c3fb27SDimitry Andric /// according to control information in the integer literal \a imm, and 190806c3fb27SDimitry Andric /// returns the 256-bit result. In effect there are two parallel 128-bit 190906c3fb27SDimitry Andric /// shuffles in the lower and upper halves. 
191006c3fb27SDimitry Andric /// 191106c3fb27SDimitry Andric /// \code{.operation} 191206c3fb27SDimitry Andric /// FOR i := 0 TO 3 191306c3fb27SDimitry Andric /// j := i*32 191406c3fb27SDimitry Andric /// k := (imm >> i*2)[1:0] * 32 191506c3fb27SDimitry Andric /// result[j+31:j] := a[k+31:k] 191606c3fb27SDimitry Andric /// result[128+j+31:128+j] := a[128+k+31:128+k] 191706c3fb27SDimitry Andric /// ENDFOR 191806c3fb27SDimitry Andric /// \endcode 191906c3fb27SDimitry Andric /// 192006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 192106c3fb27SDimitry Andric /// 192206c3fb27SDimitry Andric /// \code 192306c3fb27SDimitry Andric /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm); 192406c3fb27SDimitry Andric /// \endcode 192506c3fb27SDimitry Andric /// 192606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSHUFD instruction. 192706c3fb27SDimitry Andric /// 192806c3fb27SDimitry Andric /// \param a 192906c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing source values. 193006c3fb27SDimitry Andric /// \param imm 193106c3fb27SDimitry Andric /// An immediate 8-bit value specifying which elements to copy from \a a. 193206c3fb27SDimitry Andric /// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the 193306c3fb27SDimitry Andric /// result, \a imm[3:2] specifies the index for elements 1 and 5, and so 193406c3fb27SDimitry Andric /// forth. 193506c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 19360b57cec5SDimitry Andric #define _mm256_shuffle_epi32(a, imm) \ 1937349cc55cSDimitry Andric ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))) 19380b57cec5SDimitry Andric 193906c3fb27SDimitry Andric /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a 194006c3fb27SDimitry Andric /// according to control information in the integer literal \a imm, and 194106c3fb27SDimitry Andric /// returns the 256-bit result. 
The upper 64 bits of each 128-bit half 194206c3fb27SDimitry Andric /// are shuffled in parallel; the lower 64 bits of each 128-bit half are 194306c3fb27SDimitry Andric /// copied from \a a unchanged. 194406c3fb27SDimitry Andric /// 194506c3fb27SDimitry Andric /// \code{.operation} 194606c3fb27SDimitry Andric /// result[63:0] := a[63:0] 194706c3fb27SDimitry Andric /// result[191:128] := a[191:128] 194806c3fb27SDimitry Andric /// FOR i := 0 TO 3 194906c3fb27SDimitry Andric /// j := i * 16 + 64 195006c3fb27SDimitry Andric /// k := (imm >> i*2)[1:0] * 16 + 64 195106c3fb27SDimitry Andric /// result[j+15:j] := a[k+15:k] 195206c3fb27SDimitry Andric /// result[128+j+15:128+j] := a[128+k+15:128+k] 195306c3fb27SDimitry Andric /// ENDFOR 195406c3fb27SDimitry Andric /// \endcode 195506c3fb27SDimitry Andric /// 195606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 195706c3fb27SDimitry Andric /// 195806c3fb27SDimitry Andric /// \code 195906c3fb27SDimitry Andric /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm); 196006c3fb27SDimitry Andric /// \endcode 196106c3fb27SDimitry Andric /// 196206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSHUFHW instruction. 196306c3fb27SDimitry Andric /// 196406c3fb27SDimitry Andric /// \param a 196506c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing source values. 196606c3fb27SDimitry Andric /// \param imm 196706c3fb27SDimitry Andric /// An immediate 8-bit value specifying which elements to copy from \a a. 196806c3fb27SDimitry Andric /// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the 196906c3fb27SDimitry Andric /// result, \a imm[3:2] specifies the index for elements 5 and 13, and so 197006c3fb27SDimitry Andric /// forth. Indexes are offset by 4 (so 0 means index 4, and so forth). 197106c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
19720b57cec5SDimitry Andric #define _mm256_shufflehi_epi16(a, imm) \ 1973349cc55cSDimitry Andric ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))) 19740b57cec5SDimitry Andric 197506c3fb27SDimitry Andric /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a 197606c3fb27SDimitry Andric /// according to control information in the integer literal \a imm, and 197706c3fb27SDimitry Andric /// returns the 256-bit [16 x i16] result. The lower 64 bits of each 197806c3fb27SDimitry Andric /// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are 197906c3fb27SDimitry Andric /// copied from \a a unchanged. 198006c3fb27SDimitry Andric /// 198106c3fb27SDimitry Andric /// \code{.operation} 198206c3fb27SDimitry Andric /// result[127:64] := a[127:64] 198306c3fb27SDimitry Andric /// result[255:192] := a[255:192] 198406c3fb27SDimitry Andric /// FOR i := 0 TO 3 198506c3fb27SDimitry Andric /// j := i * 16 198606c3fb27SDimitry Andric /// k := (imm >> i*2)[1:0] * 16 198706c3fb27SDimitry Andric /// result[j+15:j] := a[k+15:k] 198806c3fb27SDimitry Andric /// result[128+j+15:128+j] := a[128+k+15:128+k] 198906c3fb27SDimitry Andric /// ENDFOR 199006c3fb27SDimitry Andric /// \endcode 199106c3fb27SDimitry Andric /// 199206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 199306c3fb27SDimitry Andric /// 199406c3fb27SDimitry Andric /// \code 199506c3fb27SDimitry Andric /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm); 199606c3fb27SDimitry Andric /// \endcode 199706c3fb27SDimitry Andric /// 199806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSHUFLW instruction. 199906c3fb27SDimitry Andric /// 200006c3fb27SDimitry Andric /// \param a 200106c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] to use as a source of data for the 200206c3fb27SDimitry Andric /// result. 200306c3fb27SDimitry Andric /// \param imm 200406c3fb27SDimitry Andric /// An immediate 8-bit value specifying which elements to copy from \a a. 
200506c3fb27SDimitry Andric /// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the 200606c3fb27SDimitry Andric /// result, \a imm[3:2] specifies the index for elements 1 and 9, and so 200706c3fb27SDimitry Andric /// forth. 200806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 20090b57cec5SDimitry Andric #define _mm256_shufflelo_epi16(a, imm) \ 2010349cc55cSDimitry Andric ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) 20110b57cec5SDimitry Andric 201206c3fb27SDimitry Andric /// Sets each byte of the result to the corresponding byte of the 256-bit 201306c3fb27SDimitry Andric /// integer vector in \a __a, the negative of that byte, or zero, depending 201406c3fb27SDimitry Andric /// on whether the corresponding byte of the 256-bit integer vector in 201506c3fb27SDimitry Andric /// \a __b is greater than zero, less than zero, or equal to zero, 201606c3fb27SDimitry Andric /// respectively. 201706c3fb27SDimitry Andric /// 201806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 201906c3fb27SDimitry Andric /// 202006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSIGNB instruction. 202106c3fb27SDimitry Andric /// 202206c3fb27SDimitry Andric /// \param __a 202306c3fb27SDimitry Andric /// A 256-bit integer vector. 202406c3fb27SDimitry Andric /// \param __b 202506c3fb27SDimitry Andric /// A 256-bit integer vector. 202606c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
20270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 20280b57cec5SDimitry Andric _mm256_sign_epi8(__m256i __a, __m256i __b) 20290b57cec5SDimitry Andric { 20300b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); 20310b57cec5SDimitry Andric } 20320b57cec5SDimitry Andric 203306c3fb27SDimitry Andric /// Sets each element of the result to the corresponding element of the 203406c3fb27SDimitry Andric /// 256-bit vector of [16 x i16] in \a __a, the negative of that element, 203506c3fb27SDimitry Andric /// or zero, depending on whether the corresponding element of the 256-bit 203606c3fb27SDimitry Andric /// vector of [16 x i16] in \a __b is greater than zero, less than zero, or 203706c3fb27SDimitry Andric /// equal to zero, respectively. 203806c3fb27SDimitry Andric /// 203906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 204006c3fb27SDimitry Andric /// 204106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSIGNW instruction. 204206c3fb27SDimitry Andric /// 204306c3fb27SDimitry Andric /// \param __a 204406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 204506c3fb27SDimitry Andric /// \param __b 204606c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16]. 204706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
20480b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 20490b57cec5SDimitry Andric _mm256_sign_epi16(__m256i __a, __m256i __b) 20500b57cec5SDimitry Andric { 20510b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); 20520b57cec5SDimitry Andric } 20530b57cec5SDimitry Andric 205406c3fb27SDimitry Andric /// Sets each element of the result to the corresponding element of the 205506c3fb27SDimitry Andric /// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or 205606c3fb27SDimitry Andric /// zero, depending on whether the corresponding element of the 256-bit 205706c3fb27SDimitry Andric /// vector of [8 x i32] in \a __b is greater than zero, less than zero, or 205806c3fb27SDimitry Andric /// equal to zero, respectively. 205906c3fb27SDimitry Andric /// 206006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 206106c3fb27SDimitry Andric /// 206206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSIGND instruction. 206306c3fb27SDimitry Andric /// 206406c3fb27SDimitry Andric /// \param __a 206506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 206606c3fb27SDimitry Andric /// \param __b 206706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32]. 206806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 20690b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 20700b57cec5SDimitry Andric _mm256_sign_epi32(__m256i __a, __m256i __b) 20710b57cec5SDimitry Andric { 20720b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); 20730b57cec5SDimitry Andric } 20740b57cec5SDimitry Andric 207506c3fb27SDimitry Andric /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 207606c3fb27SDimitry Andric /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 207706c3fb27SDimitry Andric /// is greater than 15, the returned result is all zeroes. 
207806c3fb27SDimitry Andric /// 207906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 208006c3fb27SDimitry Andric /// 208106c3fb27SDimitry Andric /// \code 208206c3fb27SDimitry Andric /// __m256i _mm256_slli_si256(__m256i a, const int imm); 208306c3fb27SDimitry Andric /// \endcode 208406c3fb27SDimitry Andric /// 208506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLDQ instruction. 208606c3fb27SDimitry Andric /// 208706c3fb27SDimitry Andric /// \param a 208806c3fb27SDimitry Andric /// A 256-bit integer vector to be shifted. 208906c3fb27SDimitry Andric /// \param imm 209006c3fb27SDimitry Andric /// An unsigned immediate value specifying the shift count (in bytes). 209106c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 20920b57cec5SDimitry Andric #define _mm256_slli_si256(a, imm) \ 2093349cc55cSDimitry Andric ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 20940b57cec5SDimitry Andric 209506c3fb27SDimitry Andric /// Shifts each 128-bit half of the 256-bit integer vector \a a left by 209606c3fb27SDimitry Andric /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm 209706c3fb27SDimitry Andric /// is greater than 15, the returned result is all zeroes. 209806c3fb27SDimitry Andric /// 209906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 210006c3fb27SDimitry Andric /// 210106c3fb27SDimitry Andric /// \code 210206c3fb27SDimitry Andric /// __m256i _mm256_bslli_epi128(__m256i a, const int imm); 210306c3fb27SDimitry Andric /// \endcode 210406c3fb27SDimitry Andric /// 210506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLDQ instruction. 210606c3fb27SDimitry Andric /// 210706c3fb27SDimitry Andric /// \param a 210806c3fb27SDimitry Andric /// A 256-bit integer vector to be shifted. 210906c3fb27SDimitry Andric /// \param imm 211006c3fb27SDimitry Andric /// An unsigned immediate value specifying the shift count (in bytes). 
211106c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 21120b57cec5SDimitry Andric #define _mm256_bslli_epi128(a, imm) \ 2113349cc55cSDimitry Andric ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) 21140b57cec5SDimitry Andric 211506c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 211606c3fb27SDimitry Andric /// left by \a __count bits, shifting in zero bits, and returns the result. 211706c3fb27SDimitry Andric /// If \a __count is greater than 15, the returned result is all zeroes. 211806c3fb27SDimitry Andric /// 211906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 212006c3fb27SDimitry Andric /// 212106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLW instruction. 212206c3fb27SDimitry Andric /// 212306c3fb27SDimitry Andric /// \param __a 212406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] to be shifted. 212506c3fb27SDimitry Andric /// \param __count 212606c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 212706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 21280b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 21290b57cec5SDimitry Andric _mm256_slli_epi16(__m256i __a, int __count) 21300b57cec5SDimitry Andric { 21310b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); 21320b57cec5SDimitry Andric } 21330b57cec5SDimitry Andric 213406c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 213506c3fb27SDimitry Andric /// left by the number of bits specified by the lower 64 bits of \a __count, 213606c3fb27SDimitry Andric /// shifting in zero bits, and returns the result. If \a __count is greater 213706c3fb27SDimitry Andric /// than 15, the returned result is all zeroes. 
213806c3fb27SDimitry Andric /// 213906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 214006c3fb27SDimitry Andric /// 214106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLW instruction. 214206c3fb27SDimitry Andric /// 214306c3fb27SDimitry Andric /// \param __a 214406c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] to be shifted. 214506c3fb27SDimitry Andric /// \param __count 214606c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 214706c3fb27SDimitry Andric /// shift count (in bits). The upper element is ignored. 214806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 21490b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 21500b57cec5SDimitry Andric _mm256_sll_epi16(__m256i __a, __m128i __count) 21510b57cec5SDimitry Andric { 21520b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); 21530b57cec5SDimitry Andric } 21540b57cec5SDimitry Andric 215506c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 215606c3fb27SDimitry Andric /// left by \a __count bits, shifting in zero bits, and returns the result. 215706c3fb27SDimitry Andric /// If \a __count is greater than 31, the returned result is all zeroes. 215806c3fb27SDimitry Andric /// 215906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 216006c3fb27SDimitry Andric /// 216106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLD instruction. 216206c3fb27SDimitry Andric /// 216306c3fb27SDimitry Andric /// \param __a 216406c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 216506c3fb27SDimitry Andric /// \param __count 216606c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 216706c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
21680b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 21690b57cec5SDimitry Andric _mm256_slli_epi32(__m256i __a, int __count) 21700b57cec5SDimitry Andric { 21710b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); 21720b57cec5SDimitry Andric } 21730b57cec5SDimitry Andric 217406c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 217506c3fb27SDimitry Andric /// left by the number of bits given in the lower 64 bits of \a __count, 217606c3fb27SDimitry Andric /// shifting in zero bits, and returns the result. If \a __count is greater 217706c3fb27SDimitry Andric /// than 31, the returned result is all zeroes. 217806c3fb27SDimitry Andric /// 217906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 218006c3fb27SDimitry Andric /// 218106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLD instruction. 218206c3fb27SDimitry Andric /// 218306c3fb27SDimitry Andric /// \param __a 218406c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 218506c3fb27SDimitry Andric /// \param __count 218606c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 218706c3fb27SDimitry Andric /// shift count (in bits). The upper element is ignored. 218806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 21890b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 21900b57cec5SDimitry Andric _mm256_sll_epi32(__m256i __a, __m128i __count) 21910b57cec5SDimitry Andric { 21920b57cec5SDimitry Andric return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); 21930b57cec5SDimitry Andric } 21940b57cec5SDimitry Andric 219506c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 219606c3fb27SDimitry Andric /// left by \a __count bits, shifting in zero bits, and returns the result. 
219706c3fb27SDimitry Andric /// If \a __count is greater than 63, the returned result is all zeroes. 219806c3fb27SDimitry Andric /// 219906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 220006c3fb27SDimitry Andric /// 220106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLQ instruction. 220206c3fb27SDimitry Andric /// 220306c3fb27SDimitry Andric /// \param __a 220406c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] to be shifted. 220506c3fb27SDimitry Andric /// \param __count 220606c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 220706c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 22080b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 22090b57cec5SDimitry Andric _mm256_slli_epi64(__m256i __a, int __count) 22100b57cec5SDimitry Andric { 22110b57cec5SDimitry Andric return __builtin_ia32_psllqi256((__v4di)__a, __count); 22120b57cec5SDimitry Andric } 22130b57cec5SDimitry Andric 221406c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 221506c3fb27SDimitry Andric /// left by the number of bits given in the lower 64 bits of \a __count, 221606c3fb27SDimitry Andric /// shifting in zero bits, and returns the result. If \a __count is greater 221706c3fb27SDimitry Andric /// than 63, the returned result is all zeroes. 221806c3fb27SDimitry Andric /// 221906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 222006c3fb27SDimitry Andric /// 222106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLQ instruction. 222206c3fb27SDimitry Andric /// 222306c3fb27SDimitry Andric /// \param __a 222406c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] to be shifted. 222506c3fb27SDimitry Andric /// \param __count 222606c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 222706c3fb27SDimitry Andric /// shift count (in bits). 
The upper element is ignored. 222806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 22290b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 22300b57cec5SDimitry Andric _mm256_sll_epi64(__m256i __a, __m128i __count) 22310b57cec5SDimitry Andric { 22320b57cec5SDimitry Andric return __builtin_ia32_psllq256((__v4di)__a, __count); 22330b57cec5SDimitry Andric } 22340b57cec5SDimitry Andric 223506c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 223606c3fb27SDimitry Andric /// right by \a __count bits, shifting in sign bits, and returns the result. 223706c3fb27SDimitry Andric /// If \a __count is greater than 15, each element of the result is either 223806c3fb27SDimitry Andric /// 0 or -1 according to the corresponding input sign bit. 223906c3fb27SDimitry Andric /// 224006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 224106c3fb27SDimitry Andric /// 224206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAW instruction. 224306c3fb27SDimitry Andric /// 224406c3fb27SDimitry Andric /// \param __a 224506c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] to be shifted. 224606c3fb27SDimitry Andric /// \param __count 224706c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 224806c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
22490b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 22500b57cec5SDimitry Andric _mm256_srai_epi16(__m256i __a, int __count) 22510b57cec5SDimitry Andric { 22520b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); 22530b57cec5SDimitry Andric } 22540b57cec5SDimitry Andric 225506c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 225606c3fb27SDimitry Andric /// right by the number of bits given in the lower 64 bits of \a __count, 225706c3fb27SDimitry Andric /// shifting in sign bits, and returns the result. If \a __count is greater 225806c3fb27SDimitry Andric /// than 15, each element of the result is either 0 or -1 according to the 225906c3fb27SDimitry Andric /// corresponding input sign bit. 226006c3fb27SDimitry Andric /// 226106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 226206c3fb27SDimitry Andric /// 226306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAW instruction. 226406c3fb27SDimitry Andric /// 226506c3fb27SDimitry Andric /// \param __a 226606c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] to be shifted. 226706c3fb27SDimitry Andric /// \param __count 226806c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 226906c3fb27SDimitry Andric /// shift count (in bits). The upper element is ignored. 227006c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 
22710b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 22720b57cec5SDimitry Andric _mm256_sra_epi16(__m256i __a, __m128i __count) 22730b57cec5SDimitry Andric { 22740b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); 22750b57cec5SDimitry Andric } 22760b57cec5SDimitry Andric 227706c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 227806c3fb27SDimitry Andric /// right by \a __count bits, shifting in sign bits, and returns the result. 227906c3fb27SDimitry Andric /// If \a __count is greater than 31, each element of the result is either 228006c3fb27SDimitry Andric /// 0 or -1 according to the corresponding input sign bit. 228106c3fb27SDimitry Andric /// 228206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 228306c3fb27SDimitry Andric /// 228406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAD instruction. 228506c3fb27SDimitry Andric /// 228606c3fb27SDimitry Andric /// \param __a 228706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 228806c3fb27SDimitry Andric /// \param __count 228906c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 229006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 22910b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 22920b57cec5SDimitry Andric _mm256_srai_epi32(__m256i __a, int __count) 22930b57cec5SDimitry Andric { 22940b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); 22950b57cec5SDimitry Andric } 22960b57cec5SDimitry Andric 229706c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 229806c3fb27SDimitry Andric /// right by the number of bits given in the lower 64 bits of \a __count, 229906c3fb27SDimitry Andric /// shifting in sign bits, and returns the result. 
If \a __count is greater 230006c3fb27SDimitry Andric /// than 31, each element of the result is either 0 or -1 according to the 230106c3fb27SDimitry Andric /// corresponding input sign bit. 230206c3fb27SDimitry Andric /// 230306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 230406c3fb27SDimitry Andric /// 230506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAD instruction. 230606c3fb27SDimitry Andric /// 230706c3fb27SDimitry Andric /// \param __a 230806c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 230906c3fb27SDimitry Andric /// \param __count 231006c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 231106c3fb27SDimitry Andric /// shift count (in bits). The upper element is ignored. 231206c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 23130b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 23140b57cec5SDimitry Andric _mm256_sra_epi32(__m256i __a, __m128i __count) 23150b57cec5SDimitry Andric { 23160b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); 23170b57cec5SDimitry Andric } 23180b57cec5SDimitry Andric 231906c3fb27SDimitry Andric /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 232006c3fb27SDimitry Andric /// \a imm bytes, shifting in zero bytes, and returns the result. If 232106c3fb27SDimitry Andric /// \a imm is greater than 15, the returned result is all zeroes. 232206c3fb27SDimitry Andric /// 232306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 232406c3fb27SDimitry Andric /// 232506c3fb27SDimitry Andric /// \code 232606c3fb27SDimitry Andric /// __m256i _mm256_srli_si256(__m256i a, const int imm); 232706c3fb27SDimitry Andric /// \endcode 232806c3fb27SDimitry Andric /// 232906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLDQ instruction. 
233006c3fb27SDimitry Andric /// 233106c3fb27SDimitry Andric /// \param a 233206c3fb27SDimitry Andric /// A 256-bit integer vector to be shifted. 233306c3fb27SDimitry Andric /// \param imm 233406c3fb27SDimitry Andric /// An unsigned immediate value specifying the shift count (in bytes). 233506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 23360b57cec5SDimitry Andric #define _mm256_srli_si256(a, imm) \ 2337349cc55cSDimitry Andric ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 23380b57cec5SDimitry Andric 233906c3fb27SDimitry Andric /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by 234006c3fb27SDimitry Andric /// \a imm bytes, shifting in zero bytes, and returns the result. If 234106c3fb27SDimitry Andric /// \a imm is greater than 15, the returned result is all zeroes. 234206c3fb27SDimitry Andric /// 234306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 234406c3fb27SDimitry Andric /// 234506c3fb27SDimitry Andric /// \code 234606c3fb27SDimitry Andric /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm); 234706c3fb27SDimitry Andric /// \endcode 234806c3fb27SDimitry Andric /// 234906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLDQ instruction. 235006c3fb27SDimitry Andric /// 235106c3fb27SDimitry Andric /// \param a 235206c3fb27SDimitry Andric /// A 256-bit integer vector to be shifted. 235306c3fb27SDimitry Andric /// \param imm 235406c3fb27SDimitry Andric /// An unsigned immediate value specifying the shift count (in bytes). 235506c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
23560b57cec5SDimitry Andric #define _mm256_bsrli_epi128(a, imm) \ 2357349cc55cSDimitry Andric ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) 23580b57cec5SDimitry Andric 235906c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 236006c3fb27SDimitry Andric /// right by \a __count bits, shifting in zero bits, and returns the result. 236106c3fb27SDimitry Andric /// If \a __count is greater than 15, the returned result is all zeroes. 236206c3fb27SDimitry Andric /// 236306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 236406c3fb27SDimitry Andric /// 236506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLW instruction. 236606c3fb27SDimitry Andric /// 236706c3fb27SDimitry Andric /// \param __a 236806c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] to be shifted. 236906c3fb27SDimitry Andric /// \param __count 237006c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 237106c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 23720b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 23730b57cec5SDimitry Andric _mm256_srli_epi16(__m256i __a, int __count) 23740b57cec5SDimitry Andric { 23750b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); 23760b57cec5SDimitry Andric } 23770b57cec5SDimitry Andric 237806c3fb27SDimitry Andric /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a 237906c3fb27SDimitry Andric /// right by the number of bits given in the lower 64 bits of \a __count, 238006c3fb27SDimitry Andric /// shifting in zero bits, and returns the result. If \a __count is greater 238106c3fb27SDimitry Andric /// than 15, the returned result is all zeroes. 
238206c3fb27SDimitry Andric /// 238306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 238406c3fb27SDimitry Andric /// 238506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLW instruction. 238606c3fb27SDimitry Andric /// 238706c3fb27SDimitry Andric /// \param __a 238806c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] to be shifted. 238906c3fb27SDimitry Andric /// \param __count 239006c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 239106c3fb27SDimitry Andric /// shift count (in bits). The upper element is ignored. 239206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 23930b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 23940b57cec5SDimitry Andric _mm256_srl_epi16(__m256i __a, __m128i __count) 23950b57cec5SDimitry Andric { 23960b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); 23970b57cec5SDimitry Andric } 23980b57cec5SDimitry Andric 239906c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 240006c3fb27SDimitry Andric /// right by \a __count bits, shifting in zero bits, and returns the result. 240106c3fb27SDimitry Andric /// If \a __count is greater than 31, the returned result is all zeroes. 240206c3fb27SDimitry Andric /// 240306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 240406c3fb27SDimitry Andric /// 240506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLD instruction. 240606c3fb27SDimitry Andric /// 240706c3fb27SDimitry Andric /// \param __a 240806c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 240906c3fb27SDimitry Andric /// \param __count 241006c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 241106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
24120b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 24130b57cec5SDimitry Andric _mm256_srli_epi32(__m256i __a, int __count) 24140b57cec5SDimitry Andric { 24150b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); 24160b57cec5SDimitry Andric } 24170b57cec5SDimitry Andric 241806c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a 241906c3fb27SDimitry Andric /// right by the number of bits given in the lower 64 bits of \a __count, 242006c3fb27SDimitry Andric /// shifting in zero bits, and returns the result. If \a __count is greater 242106c3fb27SDimitry Andric /// than 31, the returned result is all zeroes. 242206c3fb27SDimitry Andric /// 242306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 242406c3fb27SDimitry Andric /// 242506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLD instruction. 242606c3fb27SDimitry Andric /// 242706c3fb27SDimitry Andric /// \param __a 242806c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 242906c3fb27SDimitry Andric /// \param __count 243006c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 243106c3fb27SDimitry Andric /// shift count (in bits). The upper element is ignored. 243206c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 24330b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 24340b57cec5SDimitry Andric _mm256_srl_epi32(__m256i __a, __m128i __count) 24350b57cec5SDimitry Andric { 24360b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); 24370b57cec5SDimitry Andric } 24380b57cec5SDimitry Andric 243906c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 244006c3fb27SDimitry Andric /// right by \a __count bits, shifting in zero bits, and returns the result. 
244106c3fb27SDimitry Andric /// If \a __count is greater than 63, the returned result is all zeroes. 244206c3fb27SDimitry Andric /// 244306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 244406c3fb27SDimitry Andric /// 244506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLQ instruction. 244606c3fb27SDimitry Andric /// 244706c3fb27SDimitry Andric /// \param __a 244806c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] to be shifted. 244906c3fb27SDimitry Andric /// \param __count 245006c3fb27SDimitry Andric /// An unsigned integer value specifying the shift count (in bits). 245106c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 24520b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 24530b57cec5SDimitry Andric _mm256_srli_epi64(__m256i __a, int __count) 24540b57cec5SDimitry Andric { 24550b57cec5SDimitry Andric return __builtin_ia32_psrlqi256((__v4di)__a, __count); 24560b57cec5SDimitry Andric } 24570b57cec5SDimitry Andric 245806c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a 245906c3fb27SDimitry Andric /// right by the number of bits given in the lower 64 bits of \a __count, 246006c3fb27SDimitry Andric /// shifting in zero bits, and returns the result. If \a __count is greater 246106c3fb27SDimitry Andric /// than 63, the returned result is all zeroes. 246206c3fb27SDimitry Andric /// 246306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 246406c3fb27SDimitry Andric /// 246506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLQ instruction. 246606c3fb27SDimitry Andric /// 246706c3fb27SDimitry Andric /// \param __a 246806c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] to be shifted. 246906c3fb27SDimitry Andric /// \param __count 247006c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned 247106c3fb27SDimitry Andric /// shift count (in bits). 
The upper element is ignored. 247206c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 24730b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 24740b57cec5SDimitry Andric _mm256_srl_epi64(__m256i __a, __m128i __count) 24750b57cec5SDimitry Andric { 24760b57cec5SDimitry Andric return __builtin_ia32_psrlq256((__v4di)__a, __count); 24770b57cec5SDimitry Andric } 24780b57cec5SDimitry Andric 247906c3fb27SDimitry Andric /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 248006c3fb27SDimitry Andric /// vectors. Returns the lower 8 bits of each difference in the 248106c3fb27SDimitry Andric /// corresponding byte of the 256-bit integer vector result (overflow is 248206c3fb27SDimitry Andric /// ignored). 248306c3fb27SDimitry Andric /// 248406c3fb27SDimitry Andric /// \code{.operation} 248506c3fb27SDimitry Andric /// FOR i := 0 TO 31 248606c3fb27SDimitry Andric /// j := i*8 248706c3fb27SDimitry Andric /// result[j+7:j] := __a[j+7:j] - __b[j+7:j] 248806c3fb27SDimitry Andric /// ENDFOR 248906c3fb27SDimitry Andric /// \endcode 249006c3fb27SDimitry Andric /// 249106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 249206c3fb27SDimitry Andric /// 249306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBB instruction. 249406c3fb27SDimitry Andric /// 249506c3fb27SDimitry Andric /// \param __a 249606c3fb27SDimitry Andric /// A 256-bit integer vector containing the minuends. 249706c3fb27SDimitry Andric /// \param __b 249806c3fb27SDimitry Andric /// A 256-bit integer vector containing the subtrahends. 249906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the differences. 
25000b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 25010b57cec5SDimitry Andric _mm256_sub_epi8(__m256i __a, __m256i __b) 25020b57cec5SDimitry Andric { 25030b57cec5SDimitry Andric return (__m256i)((__v32qu)__a - (__v32qu)__b); 25040b57cec5SDimitry Andric } 25050b57cec5SDimitry Andric 250606c3fb27SDimitry Andric /// Subtracts 16-bit integers from corresponding elements of two 256-bit 250706c3fb27SDimitry Andric /// vectors of [16 x i16]. Returns the lower 16 bits of each difference in 250806c3fb27SDimitry Andric /// the corresponding element of the [16 x i16] result (overflow is 250906c3fb27SDimitry Andric /// ignored). 251006c3fb27SDimitry Andric /// 251106c3fb27SDimitry Andric /// \code{.operation} 251206c3fb27SDimitry Andric /// FOR i := 0 TO 15 251306c3fb27SDimitry Andric /// j := i*16 251406c3fb27SDimitry Andric /// result[j+15:j] := __a[j+15:j] - __b[j+15:j] 251506c3fb27SDimitry Andric /// ENDFOR 251606c3fb27SDimitry Andric /// \endcode 251706c3fb27SDimitry Andric /// 251806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 251906c3fb27SDimitry Andric /// 252006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBW instruction. 252106c3fb27SDimitry Andric /// 252206c3fb27SDimitry Andric /// \param __a 252306c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing the minuends. 252406c3fb27SDimitry Andric /// \param __b 252506c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing the subtrahends. 252606c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences. 
25270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 25280b57cec5SDimitry Andric _mm256_sub_epi16(__m256i __a, __m256i __b) 25290b57cec5SDimitry Andric { 25300b57cec5SDimitry Andric return (__m256i)((__v16hu)__a - (__v16hu)__b); 25310b57cec5SDimitry Andric } 25320b57cec5SDimitry Andric 253306c3fb27SDimitry Andric /// Subtracts 32-bit integers from corresponding elements of two 256-bit 253406c3fb27SDimitry Andric /// vectors of [8 x i32]. Returns the lower 32 bits of each difference in 253506c3fb27SDimitry Andric /// the corresponding element of the [8 x i32] result (overflow is ignored). 253606c3fb27SDimitry Andric /// 253706c3fb27SDimitry Andric /// \code{.operation} 253806c3fb27SDimitry Andric /// FOR i := 0 TO 7 253906c3fb27SDimitry Andric /// j := i*32 254006c3fb27SDimitry Andric /// result[j+31:j] := __a[j+31:j] - __b[j+31:j] 254106c3fb27SDimitry Andric /// ENDFOR 254206c3fb27SDimitry Andric /// \endcode 254306c3fb27SDimitry Andric /// 254406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 254506c3fb27SDimitry Andric /// 254606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBD instruction. 254706c3fb27SDimitry Andric /// 254806c3fb27SDimitry Andric /// \param __a 254906c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the minuends. 255006c3fb27SDimitry Andric /// \param __b 255106c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the subtrahends. 255206c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the differences. 
25530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 25540b57cec5SDimitry Andric _mm256_sub_epi32(__m256i __a, __m256i __b) 25550b57cec5SDimitry Andric { 25560b57cec5SDimitry Andric return (__m256i)((__v8su)__a - (__v8su)__b); 25570b57cec5SDimitry Andric } 25580b57cec5SDimitry Andric 255906c3fb27SDimitry Andric /// Subtracts 64-bit integers from corresponding elements of two 256-bit 256006c3fb27SDimitry Andric /// vectors of [4 x i64]. Returns the lower 64 bits of each difference in 256106c3fb27SDimitry Andric /// the corresponding element of the [4 x i64] result (overflow is ignored). 256206c3fb27SDimitry Andric /// 256306c3fb27SDimitry Andric /// \code{.operation} 256406c3fb27SDimitry Andric /// FOR i := 0 TO 3 256506c3fb27SDimitry Andric /// j := i*64 256606c3fb27SDimitry Andric /// result[j+63:j] := __a[j+63:j] - __b[j+63:j] 256706c3fb27SDimitry Andric /// ENDFOR 256806c3fb27SDimitry Andric /// \endcode 256906c3fb27SDimitry Andric /// 257006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 257106c3fb27SDimitry Andric /// 257206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBQ instruction. 257306c3fb27SDimitry Andric /// 257406c3fb27SDimitry Andric /// \param __a 257506c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the minuends. 257606c3fb27SDimitry Andric /// \param __b 257706c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the subtrahends. 257806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the differences. 
25790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 25800b57cec5SDimitry Andric _mm256_sub_epi64(__m256i __a, __m256i __b) 25810b57cec5SDimitry Andric { 25820b57cec5SDimitry Andric return (__m256i)((__v4du)__a - (__v4du)__b); 25830b57cec5SDimitry Andric } 25840b57cec5SDimitry Andric 258506c3fb27SDimitry Andric /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 258606c3fb27SDimitry Andric /// vectors using signed saturation, and returns each differences in the 258706c3fb27SDimitry Andric /// corresponding byte of the 256-bit integer vector result. 258806c3fb27SDimitry Andric /// 258906c3fb27SDimitry Andric /// \code{.operation} 259006c3fb27SDimitry Andric /// FOR i := 0 TO 31 259106c3fb27SDimitry Andric /// j := i*8 259206c3fb27SDimitry Andric /// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j]) 259306c3fb27SDimitry Andric /// ENDFOR 259406c3fb27SDimitry Andric /// \endcode 259506c3fb27SDimitry Andric /// 259606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 259706c3fb27SDimitry Andric /// 259806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBSB instruction. 259906c3fb27SDimitry Andric /// 260006c3fb27SDimitry Andric /// \param __a 260106c3fb27SDimitry Andric /// A 256-bit integer vector containing the minuends. 260206c3fb27SDimitry Andric /// \param __b 260306c3fb27SDimitry Andric /// A 256-bit integer vector containing the subtrahends. 260406c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the differences. 
26050b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 26060b57cec5SDimitry Andric _mm256_subs_epi8(__m256i __a, __m256i __b) 26070b57cec5SDimitry Andric { 260881ad6265SDimitry Andric return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); 26090b57cec5SDimitry Andric } 26100b57cec5SDimitry Andric 261106c3fb27SDimitry Andric /// Subtracts 16-bit integers from corresponding elements of two 256-bit 261206c3fb27SDimitry Andric /// vectors of [16 x i16] using signed saturation, and returns each 261306c3fb27SDimitry Andric /// difference in the corresponding element of the [16 x i16] result. 261406c3fb27SDimitry Andric /// 261506c3fb27SDimitry Andric /// \code{.operation} 261606c3fb27SDimitry Andric /// FOR i := 0 TO 15 261706c3fb27SDimitry Andric /// j := i*16 261806c3fb27SDimitry Andric /// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j]) 261906c3fb27SDimitry Andric /// ENDFOR 262006c3fb27SDimitry Andric /// \endcode 262106c3fb27SDimitry Andric /// 262206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 262306c3fb27SDimitry Andric /// 262406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBSW instruction. 262506c3fb27SDimitry Andric /// 262606c3fb27SDimitry Andric /// \param __a 262706c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing the minuends. 262806c3fb27SDimitry Andric /// \param __b 262906c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing the subtrahends. 263006c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences. 
26310b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 26320b57cec5SDimitry Andric _mm256_subs_epi16(__m256i __a, __m256i __b) 26330b57cec5SDimitry Andric { 263481ad6265SDimitry Andric return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); 26350b57cec5SDimitry Andric } 26360b57cec5SDimitry Andric 263706c3fb27SDimitry Andric /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer 263806c3fb27SDimitry Andric /// vectors using unsigned saturation, and returns each difference in the 263906c3fb27SDimitry Andric /// corresponding byte of the 256-bit integer vector result. For each byte, 264006c3fb27SDimitry Andric /// computes <c> result = __a - __b </c>. 264106c3fb27SDimitry Andric /// 264206c3fb27SDimitry Andric /// \code{.operation} 264306c3fb27SDimitry Andric /// FOR i := 0 TO 31 264406c3fb27SDimitry Andric /// j := i*8 264506c3fb27SDimitry Andric /// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j]) 264606c3fb27SDimitry Andric /// ENDFOR 264706c3fb27SDimitry Andric /// \endcode 264806c3fb27SDimitry Andric /// 264906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 265006c3fb27SDimitry Andric /// 265106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBUSB instruction. 265206c3fb27SDimitry Andric /// 265306c3fb27SDimitry Andric /// \param __a 265406c3fb27SDimitry Andric /// A 256-bit integer vector containing the minuends. 265506c3fb27SDimitry Andric /// \param __b 265606c3fb27SDimitry Andric /// A 256-bit integer vector containing the subtrahends. 265706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the differences. 
26580b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 26590b57cec5SDimitry Andric _mm256_subs_epu8(__m256i __a, __m256i __b) 26600b57cec5SDimitry Andric { 266181ad6265SDimitry Andric return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); 26620b57cec5SDimitry Andric } 26630b57cec5SDimitry Andric 266406c3fb27SDimitry Andric /// Subtracts 16-bit integers from corresponding elements of two 256-bit 266506c3fb27SDimitry Andric /// vectors of [16 x i16] using unsigned saturation, and returns each 266606c3fb27SDimitry Andric /// difference in the corresponding element of the [16 x i16] result. 266706c3fb27SDimitry Andric /// 266806c3fb27SDimitry Andric /// \code{.operation} 266906c3fb27SDimitry Andric /// FOR i := 0 TO 15 267006c3fb27SDimitry Andric /// j := i*16 267106c3fb27SDimitry Andric /// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j]) 267206c3fb27SDimitry Andric /// ENDFOR 267306c3fb27SDimitry Andric /// \endcode 267406c3fb27SDimitry Andric /// 267506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 267606c3fb27SDimitry Andric /// 267706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSUBUSW instruction. 267806c3fb27SDimitry Andric /// 267906c3fb27SDimitry Andric /// \param __a 268006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing the minuends. 268106c3fb27SDimitry Andric /// \param __b 268206c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] containing the subtrahends. 268306c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the differences. 
26840b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 26850b57cec5SDimitry Andric _mm256_subs_epu16(__m256i __a, __m256i __b) 26860b57cec5SDimitry Andric { 268781ad6265SDimitry Andric return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); 26880b57cec5SDimitry Andric } 26890b57cec5SDimitry Andric 269006c3fb27SDimitry Andric /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 269106c3fb27SDimitry Andric /// vectors in \a __a and \a __b to form the 256-bit result. Specifically, 269206c3fb27SDimitry Andric /// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as 269306c3fb27SDimitry Andric /// input; other bits in these parameters are ignored. 269406c3fb27SDimitry Andric /// 269506c3fb27SDimitry Andric /// \code{.operation} 269606c3fb27SDimitry Andric /// result[7:0] := __a[71:64] 269706c3fb27SDimitry Andric /// result[15:8] := __b[71:64] 269806c3fb27SDimitry Andric /// result[23:16] := __a[79:72] 269906c3fb27SDimitry Andric /// result[31:24] := __b[79:72] 270006c3fb27SDimitry Andric /// . . . 270106c3fb27SDimitry Andric /// result[127:120] := __b[127:120] 270206c3fb27SDimitry Andric /// result[135:128] := __a[199:192] 270306c3fb27SDimitry Andric /// . . . 270406c3fb27SDimitry Andric /// result[255:248] := __b[255:248] 270506c3fb27SDimitry Andric /// \endcode 270606c3fb27SDimitry Andric /// 270706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 270806c3fb27SDimitry Andric /// 270906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHBW instruction. 271006c3fb27SDimitry Andric /// 271106c3fb27SDimitry Andric /// \param __a 271206c3fb27SDimitry Andric /// A 256-bit integer vector used as the source for the even-numbered bytes 271306c3fb27SDimitry Andric /// of the result. 271406c3fb27SDimitry Andric /// \param __b 271506c3fb27SDimitry Andric /// A 256-bit integer vector used as the source for the odd-numbered bytes 271606c3fb27SDimitry Andric /// of the result. 
271706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 27180b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 27190b57cec5SDimitry Andric _mm256_unpackhi_epi8(__m256i __a, __m256i __b) 27200b57cec5SDimitry Andric { 27210b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); 27220b57cec5SDimitry Andric } 27230b57cec5SDimitry Andric 272406c3fb27SDimitry Andric /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 272506c3fb27SDimitry Andric /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 272606c3fb27SDimitry Andric /// vector of [16 x i16]. Specifically, uses the upper 64 bits of each 272706c3fb27SDimitry Andric /// 128-bit half of \a __a and \a __b as input; other bits in these 272806c3fb27SDimitry Andric /// parameters are ignored. 272906c3fb27SDimitry Andric /// 273006c3fb27SDimitry Andric /// \code{.operation} 273106c3fb27SDimitry Andric /// result[15:0] := __a[79:64] 273206c3fb27SDimitry Andric /// result[31:16] := __b[79:64] 273306c3fb27SDimitry Andric /// result[47:32] := __a[95:80] 273406c3fb27SDimitry Andric /// result[63:48] := __b[95:80] 273506c3fb27SDimitry Andric /// . . . 273606c3fb27SDimitry Andric /// result[127:112] := __b[127:112] 273706c3fb27SDimitry Andric /// result[143:128] := __a[211:196] 273806c3fb27SDimitry Andric /// . . . 273906c3fb27SDimitry Andric /// result[255:240] := __b[255:240] 274006c3fb27SDimitry Andric /// \endcode 274106c3fb27SDimitry Andric /// 274206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 274306c3fb27SDimitry Andric /// 274406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHWD instruction. 
274506c3fb27SDimitry Andric /// 274606c3fb27SDimitry Andric /// \param __a 274706c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 274806c3fb27SDimitry Andric /// elements of the result. 274906c3fb27SDimitry Andric /// \param __b 275006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 275106c3fb27SDimitry Andric /// elements of the result. 275206c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 27530b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 27540b57cec5SDimitry Andric _mm256_unpackhi_epi16(__m256i __a, __m256i __b) 27550b57cec5SDimitry Andric { 27560b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 27570b57cec5SDimitry Andric } 27580b57cec5SDimitry Andric 275906c3fb27SDimitry Andric /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 276006c3fb27SDimitry Andric /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 276106c3fb27SDimitry Andric /// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half 276206c3fb27SDimitry Andric /// of \a __a and \a __b as input; other bits in these parameters are 276306c3fb27SDimitry Andric /// ignored. 
276406c3fb27SDimitry Andric /// 276506c3fb27SDimitry Andric /// \code{.operation} 276606c3fb27SDimitry Andric /// result[31:0] := __a[95:64] 276706c3fb27SDimitry Andric /// result[63:32] := __b[95:64] 276806c3fb27SDimitry Andric /// result[95:64] := __a[127:96] 276906c3fb27SDimitry Andric /// result[127:96] := __b[127:96] 277006c3fb27SDimitry Andric /// result[159:128] := __a[223:192] 277106c3fb27SDimitry Andric /// result[191:160] := __b[223:192] 277206c3fb27SDimitry Andric /// result[223:192] := __a[255:224] 277306c3fb27SDimitry Andric /// result[255:224] := __b[255:224] 277406c3fb27SDimitry Andric /// \endcode 277506c3fb27SDimitry Andric /// 277606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 277706c3fb27SDimitry Andric /// 277806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction. 277906c3fb27SDimitry Andric /// 278006c3fb27SDimitry Andric /// \param __a 278106c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 278206c3fb27SDimitry Andric /// elements of the result. 278306c3fb27SDimitry Andric /// \param __b 278406c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 278506c3fb27SDimitry Andric /// elements of the result. 278606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 27870b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 27880b57cec5SDimitry Andric _mm256_unpackhi_epi32(__m256i __a, __m256i __b) 27890b57cec5SDimitry Andric { 27900b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); 27910b57cec5SDimitry Andric } 27920b57cec5SDimitry Andric 279306c3fb27SDimitry Andric /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 279406c3fb27SDimitry Andric /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 279506c3fb27SDimitry Andric /// of [4 x i64]. 
Specifically, uses the upper 64 bits of each 128-bit half 279606c3fb27SDimitry Andric /// of \a __a and \a __b as input; other bits in these parameters are 279706c3fb27SDimitry Andric /// ignored. 279806c3fb27SDimitry Andric /// 279906c3fb27SDimitry Andric /// \code{.operation} 280006c3fb27SDimitry Andric /// result[63:0] := __a[127:64] 280106c3fb27SDimitry Andric /// result[127:64] := __b[127:64] 280206c3fb27SDimitry Andric /// result[191:128] := __a[255:192] 280306c3fb27SDimitry Andric /// result[255:192] := __b[255:192] 280406c3fb27SDimitry Andric /// \endcode 280506c3fb27SDimitry Andric /// 280606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 280706c3fb27SDimitry Andric /// 280806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction. 280906c3fb27SDimitry Andric /// 281006c3fb27SDimitry Andric /// \param __a 281106c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 281206c3fb27SDimitry Andric /// elements of the result. 281306c3fb27SDimitry Andric /// \param __b 281406c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 281506c3fb27SDimitry Andric /// elements of the result. 281606c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 28170b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 28180b57cec5SDimitry Andric _mm256_unpackhi_epi64(__m256i __a, __m256i __b) 28190b57cec5SDimitry Andric { 28200b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); 28210b57cec5SDimitry Andric } 28220b57cec5SDimitry Andric 282306c3fb27SDimitry Andric /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer 282406c3fb27SDimitry Andric /// vectors in \a __a and \a __b to form the 256-bit result. 
Specifically, 282506c3fb27SDimitry Andric /// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as 282606c3fb27SDimitry Andric /// input; other bits in these parameters are ignored. 282706c3fb27SDimitry Andric /// 282806c3fb27SDimitry Andric /// \code{.operation} 282906c3fb27SDimitry Andric /// result[7:0] := __a[7:0] 283006c3fb27SDimitry Andric /// result[15:8] := __b[7:0] 283106c3fb27SDimitry Andric /// result[23:16] := __a[15:8] 283206c3fb27SDimitry Andric /// result[31:24] := __b[15:8] 283306c3fb27SDimitry Andric /// . . . 283406c3fb27SDimitry Andric /// result[127:120] := __b[63:56] 283506c3fb27SDimitry Andric /// result[135:128] := __a[135:128] 283606c3fb27SDimitry Andric /// . . . 283706c3fb27SDimitry Andric /// result[255:248] := __b[191:184] 283806c3fb27SDimitry Andric /// \endcode 283906c3fb27SDimitry Andric /// 284006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 284106c3fb27SDimitry Andric /// 284206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLBW instruction. 284306c3fb27SDimitry Andric /// 284406c3fb27SDimitry Andric /// \param __a 284506c3fb27SDimitry Andric /// A 256-bit integer vector used as the source for the even-numbered bytes 284606c3fb27SDimitry Andric /// of the result. 284706c3fb27SDimitry Andric /// \param __b 284806c3fb27SDimitry Andric /// A 256-bit integer vector used as the source for the odd-numbered bytes 284906c3fb27SDimitry Andric /// of the result. 285006c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
28510b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 28520b57cec5SDimitry Andric _mm256_unpacklo_epi8(__m256i __a, __m256i __b) 28530b57cec5SDimitry Andric { 28540b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); 28550b57cec5SDimitry Andric } 28560b57cec5SDimitry Andric 285706c3fb27SDimitry Andric /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors 285806c3fb27SDimitry Andric /// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit 285906c3fb27SDimitry Andric /// vector of [16 x i16]. Specifically, uses the lower 64 bits of each 286006c3fb27SDimitry Andric /// 128-bit half of \a __a and \a __b as input; other bits in these 286106c3fb27SDimitry Andric /// parameters are ignored. 286206c3fb27SDimitry Andric /// 286306c3fb27SDimitry Andric /// \code{.operation} 286406c3fb27SDimitry Andric /// result[15:0] := __a[15:0] 286506c3fb27SDimitry Andric /// result[31:16] := __b[15:0] 286606c3fb27SDimitry Andric /// result[47:32] := __a[31:16] 286706c3fb27SDimitry Andric /// result[63:48] := __b[31:16] 286806c3fb27SDimitry Andric /// . . . 286906c3fb27SDimitry Andric /// result[127:112] := __b[63:48] 287006c3fb27SDimitry Andric /// result[143:128] := __a[143:128] 287106c3fb27SDimitry Andric /// . . . 287206c3fb27SDimitry Andric /// result[255:239] := __b[191:176] 287306c3fb27SDimitry Andric /// \endcode 287406c3fb27SDimitry Andric /// 287506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 287606c3fb27SDimitry Andric /// 287706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLWD instruction. 
287806c3fb27SDimitry Andric /// 287906c3fb27SDimitry Andric /// \param __a 288006c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used as the source for the even-numbered 288106c3fb27SDimitry Andric /// elements of the result. 288206c3fb27SDimitry Andric /// \param __b 288306c3fb27SDimitry Andric /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered 288406c3fb27SDimitry Andric /// elements of the result. 288506c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 28860b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 28870b57cec5SDimitry Andric _mm256_unpacklo_epi16(__m256i __a, __m256i __b) 28880b57cec5SDimitry Andric { 28890b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); 28900b57cec5SDimitry Andric } 28910b57cec5SDimitry Andric 289206c3fb27SDimitry Andric /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors 289306c3fb27SDimitry Andric /// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector 289406c3fb27SDimitry Andric /// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half 289506c3fb27SDimitry Andric /// of \a __a and \a __b as input; other bits in these parameters are 289606c3fb27SDimitry Andric /// ignored. 
289706c3fb27SDimitry Andric /// 289806c3fb27SDimitry Andric /// \code{.operation} 289906c3fb27SDimitry Andric /// result[31:0] := __a[31:0] 290006c3fb27SDimitry Andric /// result[63:32] := __b[31:0] 290106c3fb27SDimitry Andric /// result[95:64] := __a[63:32] 290206c3fb27SDimitry Andric /// result[127:96] := __b[63:32] 290306c3fb27SDimitry Andric /// result[159:128] := __a[159:128] 290406c3fb27SDimitry Andric /// result[191:160] := __b[159:128] 290506c3fb27SDimitry Andric /// result[223:192] := __a[191:160] 290606c3fb27SDimitry Andric /// result[255:224] := __b[191:190] 290706c3fb27SDimitry Andric /// \endcode 290806c3fb27SDimitry Andric /// 290906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 291006c3fb27SDimitry Andric /// 291106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction. 291206c3fb27SDimitry Andric /// 291306c3fb27SDimitry Andric /// \param __a 291406c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used as the source for the even-numbered 291506c3fb27SDimitry Andric /// elements of the result. 291606c3fb27SDimitry Andric /// \param __b 291706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered 291806c3fb27SDimitry Andric /// elements of the result. 291906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 29200b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 29210b57cec5SDimitry Andric _mm256_unpacklo_epi32(__m256i __a, __m256i __b) 29220b57cec5SDimitry Andric { 29230b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); 29240b57cec5SDimitry Andric } 29250b57cec5SDimitry Andric 292606c3fb27SDimitry Andric /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors 292706c3fb27SDimitry Andric /// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector 292806c3fb27SDimitry Andric /// of [4 x i64]. 
Specifically, uses the lower 64 bits of each 128-bit half 292906c3fb27SDimitry Andric /// of \a __a and \a __b as input; other bits in these parameters are 293006c3fb27SDimitry Andric /// ignored. 293106c3fb27SDimitry Andric /// 293206c3fb27SDimitry Andric /// \code{.operation} 293306c3fb27SDimitry Andric /// result[63:0] := __a[63:0] 293406c3fb27SDimitry Andric /// result[127:64] := __b[63:0] 293506c3fb27SDimitry Andric /// result[191:128] := __a[191:128] 293606c3fb27SDimitry Andric /// result[255:192] := __b[191:128] 293706c3fb27SDimitry Andric /// \endcode 293806c3fb27SDimitry Andric /// 293906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 294006c3fb27SDimitry Andric /// 294106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction. 294206c3fb27SDimitry Andric /// 294306c3fb27SDimitry Andric /// \param __a 294406c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] used as the source for the even-numbered 294506c3fb27SDimitry Andric /// elements of the result. 294606c3fb27SDimitry Andric /// \param __b 294706c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered 294806c3fb27SDimitry Andric /// elements of the result. 294906c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 29500b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 29510b57cec5SDimitry Andric _mm256_unpacklo_epi64(__m256i __a, __m256i __b) 29520b57cec5SDimitry Andric { 29530b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); 29540b57cec5SDimitry Andric } 29550b57cec5SDimitry Andric 295606c3fb27SDimitry Andric /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and 295706c3fb27SDimitry Andric /// \a __b. 
295806c3fb27SDimitry Andric /// 295906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 296006c3fb27SDimitry Andric /// 296106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPXOR instruction. 296206c3fb27SDimitry Andric /// 296306c3fb27SDimitry Andric /// \param __a 296406c3fb27SDimitry Andric /// A 256-bit integer vector. 296506c3fb27SDimitry Andric /// \param __b 296606c3fb27SDimitry Andric /// A 256-bit integer vector. 296706c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 29680b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 29690b57cec5SDimitry Andric _mm256_xor_si256(__m256i __a, __m256i __b) 29700b57cec5SDimitry Andric { 29710b57cec5SDimitry Andric return (__m256i)((__v4du)__a ^ (__v4du)__b); 29720b57cec5SDimitry Andric } 29730b57cec5SDimitry Andric 297406c3fb27SDimitry Andric /// Loads the 256-bit integer vector from memory \a __V using a non-temporal 297506c3fb27SDimitry Andric /// memory hint and returns the vector. \a __V must be aligned on a 32-byte 297606c3fb27SDimitry Andric /// boundary. 297706c3fb27SDimitry Andric /// 297806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 297906c3fb27SDimitry Andric /// 298006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VMOVNTDQA instruction. 298106c3fb27SDimitry Andric /// 298206c3fb27SDimitry Andric /// \param __V 298306c3fb27SDimitry Andric /// A pointer to the 32-byte aligned memory containing the vector to load. 298406c3fb27SDimitry Andric /// \returns A 256-bit integer vector loaded from memory. 
29850b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2986*5f757f3fSDimitry Andric _mm256_stream_load_si256(const void *__V) 29870b57cec5SDimitry Andric { 29880b57cec5SDimitry Andric typedef __v4di __v4di_aligned __attribute__((aligned(32))); 29890b57cec5SDimitry Andric return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); 29900b57cec5SDimitry Andric } 29910b57cec5SDimitry Andric 299206c3fb27SDimitry Andric /// Broadcasts the 32-bit floating-point value from the low element of the 299306c3fb27SDimitry Andric /// 128-bit vector of [4 x float] in \a __X to all elements of the result's 299406c3fb27SDimitry Andric /// 128-bit vector of [4 x float]. 299506c3fb27SDimitry Andric /// 299606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 299706c3fb27SDimitry Andric /// 299806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 299906c3fb27SDimitry Andric /// 300006c3fb27SDimitry Andric /// \param __X 300106c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] whose low element will be broadcast. 300206c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result. 30030b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 30040b57cec5SDimitry Andric _mm_broadcastss_ps(__m128 __X) 30050b57cec5SDimitry Andric { 30060b57cec5SDimitry Andric return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); 30070b57cec5SDimitry Andric } 30080b57cec5SDimitry Andric 300906c3fb27SDimitry Andric /// Broadcasts the 64-bit floating-point value from the low element of the 301006c3fb27SDimitry Andric /// 128-bit vector of [2 x double] in \a __a to both elements of the 301106c3fb27SDimitry Andric /// result's 128-bit vector of [2 x double]. 301206c3fb27SDimitry Andric /// 301306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 301406c3fb27SDimitry Andric /// 301506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c MOVDDUP instruction. 
301606c3fb27SDimitry Andric /// 301706c3fb27SDimitry Andric /// \param __a 301806c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] whose low element will be broadcast. 301906c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result. 30200b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 30210b57cec5SDimitry Andric _mm_broadcastsd_pd(__m128d __a) 30220b57cec5SDimitry Andric { 30230b57cec5SDimitry Andric return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 30240b57cec5SDimitry Andric } 30250b57cec5SDimitry Andric 302606c3fb27SDimitry Andric /// Broadcasts the 32-bit floating-point value from the low element of the 302706c3fb27SDimitry Andric /// 128-bit vector of [4 x float] in \a __X to all elements of the 302806c3fb27SDimitry Andric /// result's 256-bit vector of [8 x float]. 302906c3fb27SDimitry Andric /// 303006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 303106c3fb27SDimitry Andric /// 303206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTSS instruction. 303306c3fb27SDimitry Andric /// 303406c3fb27SDimitry Andric /// \param __X 303506c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] whose low element will be broadcast. 303606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 30370b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 30380b57cec5SDimitry Andric _mm256_broadcastss_ps(__m128 __X) 30390b57cec5SDimitry Andric { 30400b57cec5SDimitry Andric return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); 30410b57cec5SDimitry Andric } 30420b57cec5SDimitry Andric 304306c3fb27SDimitry Andric /// Broadcasts the 64-bit floating-point value from the low element of the 304406c3fb27SDimitry Andric /// 128-bit vector of [2 x double] in \a __X to all elements of the 304506c3fb27SDimitry Andric /// result's 256-bit vector of [4 x double]. 
304606c3fb27SDimitry Andric /// 304706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 304806c3fb27SDimitry Andric /// 304906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTSD instruction. 305006c3fb27SDimitry Andric /// 305106c3fb27SDimitry Andric /// \param __X 305206c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] whose low element will be broadcast. 305306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 30540b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256 30550b57cec5SDimitry Andric _mm256_broadcastsd_pd(__m128d __X) 30560b57cec5SDimitry Andric { 30570b57cec5SDimitry Andric return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); 30580b57cec5SDimitry Andric } 30590b57cec5SDimitry Andric 306006c3fb27SDimitry Andric /// Broadcasts the 128-bit integer data from \a __X to both the lower and 306106c3fb27SDimitry Andric /// upper halves of the 256-bit result. 306206c3fb27SDimitry Andric /// 306306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 306406c3fb27SDimitry Andric /// 306506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VBROADCASTI128 instruction. 306606c3fb27SDimitry Andric /// 306706c3fb27SDimitry Andric /// \param __X 306806c3fb27SDimitry Andric /// A 128-bit integer vector to be broadcast. 306906c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
30700b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 30710b57cec5SDimitry Andric _mm256_broadcastsi128_si256(__m128i __X) 30720b57cec5SDimitry Andric { 30730b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); 30740b57cec5SDimitry Andric } 30750b57cec5SDimitry Andric 30765ffd83dbSDimitry Andric #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) 30775ffd83dbSDimitry Andric 307806c3fb27SDimitry Andric /// Merges 32-bit integer elements from either of the two 128-bit vectors of 307906c3fb27SDimitry Andric /// [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32], 308006c3fb27SDimitry Andric /// as specified by the immediate integer operand \a M. 308106c3fb27SDimitry Andric /// 308206c3fb27SDimitry Andric /// \code{.operation} 308306c3fb27SDimitry Andric /// FOR i := 0 TO 3 308406c3fb27SDimitry Andric /// j := i*32 308506c3fb27SDimitry Andric /// IF M[i] == 0 308606c3fb27SDimitry Andric /// result[31+j:j] := V1[31+j:j] 308706c3fb27SDimitry Andric /// ELSE 308806c3fb27SDimitry Andric /// result[31+j:j] := V2[32+j:j] 308906c3fb27SDimitry Andric /// FI 309006c3fb27SDimitry Andric /// ENDFOR 309106c3fb27SDimitry Andric /// \endcode 309206c3fb27SDimitry Andric /// 309306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 309406c3fb27SDimitry Andric /// 309506c3fb27SDimitry Andric /// \code 309606c3fb27SDimitry Andric /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M); 309706c3fb27SDimitry Andric /// \endcode 309806c3fb27SDimitry Andric /// 309906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBLENDDD instruction. 310006c3fb27SDimitry Andric /// 310106c3fb27SDimitry Andric /// \param V1 310206c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing source values. 310306c3fb27SDimitry Andric /// \param V2 310406c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing source values. 
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))

/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M)                                         \
  ((__m256i)__builtin_ia32_pblendd256(                                        \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))

/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
///    bytes of the 256-bit result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
///
/// \param __X
///    A 128-bit integer vector whose low byte will be broadcast.
/// \returns A 256-bit integer vector containing the result.
31620b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 31630b57cec5SDimitry Andric _mm256_broadcastb_epi8(__m128i __X) 31640b57cec5SDimitry Andric { 31650b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 31660b57cec5SDimitry Andric } 31670b57cec5SDimitry Andric 316806c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X 316906c3fb27SDimitry Andric /// to all elements of the result's 256-bit vector of [16 x i16]. 317006c3fb27SDimitry Andric /// 317106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 317206c3fb27SDimitry Andric /// 317306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 317406c3fb27SDimitry Andric /// 317506c3fb27SDimitry Andric /// \param __X 317606c3fb27SDimitry Andric /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 317706c3fb27SDimitry Andric /// \returns A 256-bit vector of [16 x i16] containing the result. 31780b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 31790b57cec5SDimitry Andric _mm256_broadcastw_epi16(__m128i __X) 31800b57cec5SDimitry Andric { 31810b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 31820b57cec5SDimitry Andric } 31830b57cec5SDimitry Andric 318406c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 318506c3fb27SDimitry Andric /// to all elements of the result's 256-bit vector of [8 x i32]. 318606c3fb27SDimitry Andric /// 318706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 318806c3fb27SDimitry Andric /// 318906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 
319006c3fb27SDimitry Andric /// 319106c3fb27SDimitry Andric /// \param __X 319206c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 319306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 31940b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 31950b57cec5SDimitry Andric _mm256_broadcastd_epi32(__m128i __X) 31960b57cec5SDimitry Andric { 31970b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); 31980b57cec5SDimitry Andric } 31990b57cec5SDimitry Andric 320006c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 320106c3fb27SDimitry Andric /// to all elements of the result's 256-bit vector of [4 x i64]. 320206c3fb27SDimitry Andric /// 320306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 320406c3fb27SDimitry Andric /// 320506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 320606c3fb27SDimitry Andric /// 320706c3fb27SDimitry Andric /// \param __X 320806c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 320906c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 32100b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 32110b57cec5SDimitry Andric _mm256_broadcastq_epi64(__m128i __X) 32120b57cec5SDimitry Andric { 32130b57cec5SDimitry Andric return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); 32140b57cec5SDimitry Andric } 32150b57cec5SDimitry Andric 321606c3fb27SDimitry Andric /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all 321706c3fb27SDimitry Andric /// bytes of the 128-bit result. 
321806c3fb27SDimitry Andric /// 321906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 322006c3fb27SDimitry Andric /// 322106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTB instruction. 322206c3fb27SDimitry Andric /// 322306c3fb27SDimitry Andric /// \param __X 322406c3fb27SDimitry Andric /// A 128-bit integer vector whose low byte will be broadcast. 322506c3fb27SDimitry Andric /// \returns A 128-bit integer vector containing the result. 32260b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 32270b57cec5SDimitry Andric _mm_broadcastb_epi8(__m128i __X) 32280b57cec5SDimitry Andric { 32290b57cec5SDimitry Andric return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 32300b57cec5SDimitry Andric } 32310b57cec5SDimitry Andric 323206c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [8 x i16] in 323306c3fb27SDimitry Andric /// \a __X to all elements of the result's 128-bit vector of [8 x i16]. 323406c3fb27SDimitry Andric /// 323506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 323606c3fb27SDimitry Andric /// 323706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTW instruction. 323806c3fb27SDimitry Andric /// 323906c3fb27SDimitry Andric /// \param __X 324006c3fb27SDimitry Andric /// A 128-bit vector of [8 x i16] whose low element will be broadcast. 324106c3fb27SDimitry Andric /// \returns A 128-bit vector of [8 x i16] containing the result. 
32420b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 32430b57cec5SDimitry Andric _mm_broadcastw_epi16(__m128i __X) 32440b57cec5SDimitry Andric { 32450b57cec5SDimitry Andric return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); 32460b57cec5SDimitry Andric } 32470b57cec5SDimitry Andric 324806c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X 324906c3fb27SDimitry Andric /// to all elements of the result's vector of [4 x i32]. 325006c3fb27SDimitry Andric /// 325106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 325206c3fb27SDimitry Andric /// 325306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTD instruction. 325406c3fb27SDimitry Andric /// 325506c3fb27SDimitry Andric /// \param __X 325606c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] whose low element will be broadcast. 325706c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result. 32580b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 32590b57cec5SDimitry Andric _mm_broadcastd_epi32(__m128i __X) 32600b57cec5SDimitry Andric { 32610b57cec5SDimitry Andric return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); 32620b57cec5SDimitry Andric } 32630b57cec5SDimitry Andric 326406c3fb27SDimitry Andric /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X 326506c3fb27SDimitry Andric /// to both elements of the result's 128-bit vector of [2 x i64]. 326606c3fb27SDimitry Andric /// 326706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 326806c3fb27SDimitry Andric /// 326906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPBROADCASTQ instruction. 327006c3fb27SDimitry Andric /// 327106c3fb27SDimitry Andric /// \param __X 327206c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] whose low element will be broadcast. 
327306c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the result. 32740b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 32750b57cec5SDimitry Andric _mm_broadcastq_epi64(__m128i __X) 32760b57cec5SDimitry Andric { 32770b57cec5SDimitry Andric return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); 32780b57cec5SDimitry Andric } 32790b57cec5SDimitry Andric 328006c3fb27SDimitry Andric /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 328106c3fb27SDimitry Andric /// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the 328206c3fb27SDimitry Andric /// elements of the 256-bit vector of [8 x i32] in \a __b. 328306c3fb27SDimitry Andric /// 328406c3fb27SDimitry Andric /// \code{.operation} 328506c3fb27SDimitry Andric /// FOR i := 0 TO 7 328606c3fb27SDimitry Andric /// j := i*32 328706c3fb27SDimitry Andric /// k := __b[j+2:j] * 32 328806c3fb27SDimitry Andric /// result[j+31:j] := __a[k+31:k] 328906c3fb27SDimitry Andric /// ENDFOR 329006c3fb27SDimitry Andric /// \endcode 329106c3fb27SDimitry Andric /// 329206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 329306c3fb27SDimitry Andric /// 329406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPERMD instruction. 329506c3fb27SDimitry Andric /// 329606c3fb27SDimitry Andric /// \param __a 329706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the source values. 329806c3fb27SDimitry Andric /// \param __b 329906c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing indexes of values to use from 330006c3fb27SDimitry Andric /// \a __a. 330106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
33020b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 33030b57cec5SDimitry Andric _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) 33040b57cec5SDimitry Andric { 33050b57cec5SDimitry Andric return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); 33060b57cec5SDimitry Andric } 33070b57cec5SDimitry Andric 330806c3fb27SDimitry Andric /// Sets the result's 256-bit vector of [4 x double] to copies of elements of 330906c3fb27SDimitry Andric /// the 256-bit vector of [4 x double] in \a V as specified by the 331006c3fb27SDimitry Andric /// immediate value \a M. 331106c3fb27SDimitry Andric /// 331206c3fb27SDimitry Andric /// \code{.operation} 331306c3fb27SDimitry Andric /// FOR i := 0 TO 3 331406c3fb27SDimitry Andric /// j := i*64 331506c3fb27SDimitry Andric /// k := (M >> i*2)[1:0] * 64 331606c3fb27SDimitry Andric /// result[j+63:j] := V[k+63:k] 331706c3fb27SDimitry Andric /// ENDFOR 331806c3fb27SDimitry Andric /// \endcode 331906c3fb27SDimitry Andric /// 332006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 332106c3fb27SDimitry Andric /// 332206c3fb27SDimitry Andric /// \code 332306c3fb27SDimitry Andric /// __m256d _mm256_permute4x64_pd(__m256d V, const int M); 332406c3fb27SDimitry Andric /// \endcode 332506c3fb27SDimitry Andric /// 332606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPERMPD instruction. 332706c3fb27SDimitry Andric /// 332806c3fb27SDimitry Andric /// \param V 332906c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the source values. 333006c3fb27SDimitry Andric /// \param M 333106c3fb27SDimitry Andric /// An immediate 8-bit value specifying which elements to copy from \a V. 333206c3fb27SDimitry Andric /// \a M[1:0] specifies the index in \a V for element 0 of the result, 333306c3fb27SDimitry Andric /// \a M[3:2] specifies the index for element 1, and so forth. 333406c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 
33350b57cec5SDimitry Andric #define _mm256_permute4x64_pd(V, M) \ 3336349cc55cSDimitry Andric ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) 33370b57cec5SDimitry Andric 333806c3fb27SDimitry Andric /// Sets the result's 256-bit vector of [8 x float] to copies of elements of 333906c3fb27SDimitry Andric /// the 256-bit vector of [8 x float] in \a __a as specified by indexes in 334006c3fb27SDimitry Andric /// the elements of the 256-bit vector of [8 x i32] in \a __b. 334106c3fb27SDimitry Andric /// 334206c3fb27SDimitry Andric /// \code{.operation} 334306c3fb27SDimitry Andric /// FOR i := 0 TO 7 334406c3fb27SDimitry Andric /// j := i*32 334506c3fb27SDimitry Andric /// k := __b[j+2:j] * 32 334606c3fb27SDimitry Andric /// result[j+31:j] := __a[k+31:k] 334706c3fb27SDimitry Andric /// ENDFOR 334806c3fb27SDimitry Andric /// \endcode 334906c3fb27SDimitry Andric /// 335006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 335106c3fb27SDimitry Andric /// 335206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPERMPS instruction. 335306c3fb27SDimitry Andric /// 335406c3fb27SDimitry Andric /// \param __a 335506c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the source values. 335606c3fb27SDimitry Andric /// \param __b 335706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing indexes of values to use from 335806c3fb27SDimitry Andric /// \a __a. 335906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 
33600b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 33610b57cec5SDimitry Andric _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) 33620b57cec5SDimitry Andric { 33630b57cec5SDimitry Andric return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); 33640b57cec5SDimitry Andric } 33650b57cec5SDimitry Andric 336606c3fb27SDimitry Andric /// Sets the result's 256-bit vector of [4 x i64] to copies of elements 336706c3fb27SDimitry Andric /// of the 256-bit vector of [4 x i64] in \a V as specified by the 336806c3fb27SDimitry Andric /// immediate value \a M. 336906c3fb27SDimitry Andric /// 337006c3fb27SDimitry Andric /// \code{.operation} 337106c3fb27SDimitry Andric /// FOR i := 0 TO 3 337206c3fb27SDimitry Andric /// j := i*64 337306c3fb27SDimitry Andric /// k := (M >> i*2)[1:0] * 64 337406c3fb27SDimitry Andric /// result[j+63:j] := V[k+63:k] 337506c3fb27SDimitry Andric /// ENDFOR 337606c3fb27SDimitry Andric /// \endcode 337706c3fb27SDimitry Andric /// 337806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 337906c3fb27SDimitry Andric /// 338006c3fb27SDimitry Andric /// \code 338106c3fb27SDimitry Andric /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M); 338206c3fb27SDimitry Andric /// \endcode 338306c3fb27SDimitry Andric /// 338406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPERMQ instruction. 338506c3fb27SDimitry Andric /// 338606c3fb27SDimitry Andric /// \param V 338706c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the source values. 338806c3fb27SDimitry Andric /// \param M 338906c3fb27SDimitry Andric /// An immediate 8-bit value specifying which elements to copy from \a V. 339006c3fb27SDimitry Andric /// \a M[1:0] specifies the index in \a V for element 0 of the result, 339106c3fb27SDimitry Andric /// \a M[3:2] specifies the index for element 1, and so forth. 339206c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 
33930b57cec5SDimitry Andric #define _mm256_permute4x64_epi64(V, M) \ 3394349cc55cSDimitry Andric ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) 33950b57cec5SDimitry Andric 339606c3fb27SDimitry Andric /// Sets each half of the 256-bit result either to zero or to one of the 339706c3fb27SDimitry Andric /// four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2, 339806c3fb27SDimitry Andric /// as specified by the immediate value \a M. 339906c3fb27SDimitry Andric /// 340006c3fb27SDimitry Andric /// \code{.operation} 340106c3fb27SDimitry Andric /// FOR i := 0 TO 1 340206c3fb27SDimitry Andric /// j := i*128 340306c3fb27SDimitry Andric /// k := M >> (i*4) 340406c3fb27SDimitry Andric /// IF k[3] == 0 340506c3fb27SDimitry Andric /// CASE (k[1:0]) OF 340606c3fb27SDimitry Andric /// 0: result[127+j:j] := V1[127:0] 340706c3fb27SDimitry Andric /// 1: result[127+j:j] := V1[255:128] 340806c3fb27SDimitry Andric /// 2: result[127+j:j] := V2[127:0] 340906c3fb27SDimitry Andric /// 3: result[127+j:j] := V2[255:128] 341006c3fb27SDimitry Andric /// ESAC 341106c3fb27SDimitry Andric /// ELSE 341206c3fb27SDimitry Andric /// result[127+j:j] := 0 341306c3fb27SDimitry Andric /// FI 341406c3fb27SDimitry Andric /// ENDFOR 341506c3fb27SDimitry Andric /// \endcode 341606c3fb27SDimitry Andric /// 341706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 341806c3fb27SDimitry Andric /// 341906c3fb27SDimitry Andric /// \code 342006c3fb27SDimitry Andric /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M); 342106c3fb27SDimitry Andric /// \endcode 342206c3fb27SDimitry Andric /// 342306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPERM2I128 instruction. 342406c3fb27SDimitry Andric /// 342506c3fb27SDimitry Andric /// \param V1 342606c3fb27SDimitry Andric /// A 256-bit integer vector containing source values. 342706c3fb27SDimitry Andric /// \param V2 342806c3fb27SDimitry Andric /// A 256-bit integer vector containing source values. 
342906c3fb27SDimitry Andric /// \param M 343006c3fb27SDimitry Andric /// An immediate value specifying how to form the result. Bits [3:0] 343106c3fb27SDimitry Andric /// control the lower half of the result, bits [7:4] control the upper half. 343206c3fb27SDimitry Andric /// Within each 4-bit control value, if bit 3 is 1, the result is zero, 343306c3fb27SDimitry Andric /// otherwise bits [1:0] determine the source as follows. \n 343406c3fb27SDimitry Andric /// 0: the lower half of \a V1 \n 343506c3fb27SDimitry Andric /// 1: the upper half of \a V1 \n 343606c3fb27SDimitry Andric /// 2: the lower half of \a V2 \n 343706c3fb27SDimitry Andric /// 3: the upper half of \a V2 343806c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 34390b57cec5SDimitry Andric #define _mm256_permute2x128_si256(V1, V2, M) \ 3440349cc55cSDimitry Andric ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))) 34410b57cec5SDimitry Andric 344206c3fb27SDimitry Andric /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0 344306c3fb27SDimitry Andric /// of the immediate \a M is zero, extracts the lower half of \a V; 344406c3fb27SDimitry Andric /// otherwise, extracts the upper half. 344506c3fb27SDimitry Andric /// 344606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 344706c3fb27SDimitry Andric /// 344806c3fb27SDimitry Andric /// \code 344906c3fb27SDimitry Andric /// __m128i _mm256_extracti128_si256(__m256i V, const int M); 345006c3fb27SDimitry Andric /// \endcode 345106c3fb27SDimitry Andric /// 345206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VEXTRACTI128 instruction. 345306c3fb27SDimitry Andric /// 345406c3fb27SDimitry Andric /// \param V 345506c3fb27SDimitry Andric /// A 256-bit integer vector containing the source values. 345606c3fb27SDimitry Andric /// \param M 345706c3fb27SDimitry Andric /// An immediate value specifying which half of \a V to extract. 
345806c3fb27SDimitry Andric /// \returns A 128-bit integer vector containing the result. 34590b57cec5SDimitry Andric #define _mm256_extracti128_si256(V, M) \ 3460349cc55cSDimitry Andric ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))) 34610b57cec5SDimitry Andric 346206c3fb27SDimitry Andric /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the 346306c3fb27SDimitry Andric /// result with the 128-bit vector \a V2. If bit 0 of the immediate \a M 346406c3fb27SDimitry Andric /// is zero, overwrites the lower half of the result; otherwise, 346506c3fb27SDimitry Andric /// overwrites the upper half. 346606c3fb27SDimitry Andric /// 346706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 346806c3fb27SDimitry Andric /// 346906c3fb27SDimitry Andric /// \code 347006c3fb27SDimitry Andric /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M); 347106c3fb27SDimitry Andric /// \endcode 347206c3fb27SDimitry Andric /// 347306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VINSERTI128 instruction. 347406c3fb27SDimitry Andric /// 347506c3fb27SDimitry Andric /// \param V1 347606c3fb27SDimitry Andric /// A 256-bit integer vector containing a source value. 347706c3fb27SDimitry Andric /// \param V2 347806c3fb27SDimitry Andric /// A 128-bit integer vector containing a source value. 347906c3fb27SDimitry Andric /// \param M 348006c3fb27SDimitry Andric /// An immediate value specifying where to put \a V2 in the result. 348106c3fb27SDimitry Andric /// \returns A 256-bit integer vector containing the result. 
34820b57cec5SDimitry Andric #define _mm256_inserti128_si256(V1, V2, M) \ 3483349cc55cSDimitry Andric ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ 3484349cc55cSDimitry Andric (__v2di)(__m128i)(V2), (int)(M))) 34850b57cec5SDimitry Andric 348606c3fb27SDimitry Andric /// Conditionally loads eight 32-bit integer elements from memory \a __X, if 348706c3fb27SDimitry Andric /// the most significant bit of the corresponding element in the mask 348806c3fb27SDimitry Andric /// \a __M is set; otherwise, sets that element of the result to zero. 348906c3fb27SDimitry Andric /// Returns the 256-bit [8 x i32] result. 349006c3fb27SDimitry Andric /// 349106c3fb27SDimitry Andric /// \code{.operation} 349206c3fb27SDimitry Andric /// FOR i := 0 TO 7 349306c3fb27SDimitry Andric /// j := i*32 349406c3fb27SDimitry Andric /// IF __M[j+31] == 1 349506c3fb27SDimitry Andric /// result[j+31:j] := Load32(__X+(i*4)) 349606c3fb27SDimitry Andric /// ELSE 349706c3fb27SDimitry Andric /// result[j+31:j] := 0 349806c3fb27SDimitry Andric /// FI 349906c3fb27SDimitry Andric /// ENDFOR 350006c3fb27SDimitry Andric /// \endcode 350106c3fb27SDimitry Andric /// 350206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 350306c3fb27SDimitry Andric /// 350406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 350506c3fb27SDimitry Andric /// 350606c3fb27SDimitry Andric /// \param __X 350706c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 350806c3fb27SDimitry Andric /// \param __M 350906c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the mask bits. 351006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed 351106c3fb27SDimitry Andric /// elements. 
35120b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 35130b57cec5SDimitry Andric _mm256_maskload_epi32(int const *__X, __m256i __M) 35140b57cec5SDimitry Andric { 35150b57cec5SDimitry Andric return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); 35160b57cec5SDimitry Andric } 35170b57cec5SDimitry Andric 351806c3fb27SDimitry Andric /// Conditionally loads four 64-bit integer elements from memory \a __X, if 351906c3fb27SDimitry Andric /// the most significant bit of the corresponding element in the mask 352006c3fb27SDimitry Andric /// \a __M is set; otherwise, sets that element of the result to zero. 352106c3fb27SDimitry Andric /// Returns the 256-bit [4 x i64] result. 352206c3fb27SDimitry Andric /// 352306c3fb27SDimitry Andric /// \code{.operation} 352406c3fb27SDimitry Andric /// FOR i := 0 TO 3 352506c3fb27SDimitry Andric /// j := i*64 352606c3fb27SDimitry Andric /// IF __M[j+63] == 1 352706c3fb27SDimitry Andric /// result[j+63:j] := Load64(__X+(i*8)) 352806c3fb27SDimitry Andric /// ELSE 352906c3fb27SDimitry Andric /// result[j+63:j] := 0 353006c3fb27SDimitry Andric /// FI 353106c3fb27SDimitry Andric /// ENDFOR 353206c3fb27SDimitry Andric /// \endcode 353306c3fb27SDimitry Andric /// 353406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 353506c3fb27SDimitry Andric /// 353606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 353706c3fb27SDimitry Andric /// 353806c3fb27SDimitry Andric /// \param __X 353906c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 354006c3fb27SDimitry Andric /// \param __M 354106c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the mask bits. 354206c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed 354306c3fb27SDimitry Andric /// elements. 
35440b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 35450b57cec5SDimitry Andric _mm256_maskload_epi64(long long const *__X, __m256i __M) 35460b57cec5SDimitry Andric { 35470b57cec5SDimitry Andric return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); 35480b57cec5SDimitry Andric } 35490b57cec5SDimitry Andric 355006c3fb27SDimitry Andric /// Conditionally loads four 32-bit integer elements from memory \a __X, if 355106c3fb27SDimitry Andric /// the most significant bit of the corresponding element in the mask 355206c3fb27SDimitry Andric /// \a __M is set; otherwise, sets that element of the result to zero. 355306c3fb27SDimitry Andric /// Returns the 128-bit [4 x i32] result. 355406c3fb27SDimitry Andric /// 355506c3fb27SDimitry Andric /// \code{.operation} 355606c3fb27SDimitry Andric /// FOR i := 0 TO 3 355706c3fb27SDimitry Andric /// j := i*32 355806c3fb27SDimitry Andric /// IF __M[j+31] == 1 355906c3fb27SDimitry Andric /// result[j+31:j] := Load32(__X+(i*4)) 356006c3fb27SDimitry Andric /// ELSE 356106c3fb27SDimitry Andric /// result[j+31:j] := 0 356206c3fb27SDimitry Andric /// FI 356306c3fb27SDimitry Andric /// ENDFOR 356406c3fb27SDimitry Andric /// \endcode 356506c3fb27SDimitry Andric /// 356606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 356706c3fb27SDimitry Andric /// 356806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 356906c3fb27SDimitry Andric /// 357006c3fb27SDimitry Andric /// \param __X 357106c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 357206c3fb27SDimitry Andric /// \param __M 357306c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the mask bits. 357406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed 357506c3fb27SDimitry Andric /// elements. 
35760b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 35770b57cec5SDimitry Andric _mm_maskload_epi32(int const *__X, __m128i __M) 35780b57cec5SDimitry Andric { 35790b57cec5SDimitry Andric return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); 35800b57cec5SDimitry Andric } 35810b57cec5SDimitry Andric 358206c3fb27SDimitry Andric /// Conditionally loads two 64-bit integer elements from memory \a __X, if 358306c3fb27SDimitry Andric /// the most significant bit of the corresponding element in the mask 358406c3fb27SDimitry Andric /// \a __M is set; otherwise, sets that element of the result to zero. 358506c3fb27SDimitry Andric /// Returns the 128-bit [2 x i64] result. 358606c3fb27SDimitry Andric /// 358706c3fb27SDimitry Andric /// \code{.operation} 358806c3fb27SDimitry Andric /// FOR i := 0 TO 1 358906c3fb27SDimitry Andric /// j := i*64 359006c3fb27SDimitry Andric /// IF __M[j+63] == 1 359106c3fb27SDimitry Andric /// result[j+63:j] := Load64(__X+(i*8)) 359206c3fb27SDimitry Andric /// ELSE 359306c3fb27SDimitry Andric /// result[j+63:j] := 0 359406c3fb27SDimitry Andric /// FI 359506c3fb27SDimitry Andric /// ENDFOR 359606c3fb27SDimitry Andric /// \endcode 359706c3fb27SDimitry Andric /// 359806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 359906c3fb27SDimitry Andric /// 360006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 360106c3fb27SDimitry Andric /// 360206c3fb27SDimitry Andric /// \param __X 360306c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 360406c3fb27SDimitry Andric /// \param __M 360506c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing the mask bits. 360606c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed 360706c3fb27SDimitry Andric /// elements. 
36080b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 36090b57cec5SDimitry Andric _mm_maskload_epi64(long long const *__X, __m128i __M) 36100b57cec5SDimitry Andric { 36110b57cec5SDimitry Andric return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); 36120b57cec5SDimitry Andric } 36130b57cec5SDimitry Andric 361406c3fb27SDimitry Andric /// Conditionally stores eight 32-bit integer elements from the 256-bit vector 361506c3fb27SDimitry Andric /// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of 361606c3fb27SDimitry Andric /// the corresponding element in the mask \a __M is set; otherwise, the 361706c3fb27SDimitry Andric /// memory element is unchanged. 361806c3fb27SDimitry Andric /// 361906c3fb27SDimitry Andric /// \code{.operation} 362006c3fb27SDimitry Andric /// FOR i := 0 TO 7 362106c3fb27SDimitry Andric /// j := i*32 362206c3fb27SDimitry Andric /// IF __M[j+31] == 1 362306c3fb27SDimitry Andric /// Store32(__X+(i*4), __Y[j+31:j]) 362406c3fb27SDimitry Andric /// FI 362506c3fb27SDimitry Andric /// ENDFOR 362606c3fb27SDimitry Andric /// \endcode 362706c3fb27SDimitry Andric /// 362806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 362906c3fb27SDimitry Andric /// 363006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 363106c3fb27SDimitry Andric /// 363206c3fb27SDimitry Andric /// \param __X 363306c3fb27SDimitry Andric /// A pointer to the memory used for storing values. 363406c3fb27SDimitry Andric /// \param __M 363506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the mask bits. 363606c3fb27SDimitry Andric /// \param __Y 363706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the values to store. 
36380b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS256 36390b57cec5SDimitry Andric _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) 36400b57cec5SDimitry Andric { 36410b57cec5SDimitry Andric __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 36420b57cec5SDimitry Andric } 36430b57cec5SDimitry Andric 364406c3fb27SDimitry Andric /// Conditionally stores four 64-bit integer elements from the 256-bit vector 364506c3fb27SDimitry Andric /// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of 364606c3fb27SDimitry Andric /// the corresponding element in the mask \a __M is set; otherwise, the 364706c3fb27SDimitry Andric /// memory element is unchanged. 364806c3fb27SDimitry Andric /// 364906c3fb27SDimitry Andric /// \code{.operation} 365006c3fb27SDimitry Andric /// FOR i := 0 TO 3 365106c3fb27SDimitry Andric /// j := i*64 365206c3fb27SDimitry Andric /// IF __M[j+63] == 1 365306c3fb27SDimitry Andric /// Store64(__X+(i*8), __Y[j+63:j]) 365406c3fb27SDimitry Andric /// FI 365506c3fb27SDimitry Andric /// ENDFOR 365606c3fb27SDimitry Andric /// \endcode 365706c3fb27SDimitry Andric /// 365806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 365906c3fb27SDimitry Andric /// 366006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 366106c3fb27SDimitry Andric /// 366206c3fb27SDimitry Andric /// \param __X 366306c3fb27SDimitry Andric /// A pointer to the memory used for storing values. 366406c3fb27SDimitry Andric /// \param __M 366506c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the mask bits. 366606c3fb27SDimitry Andric /// \param __Y 366706c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the values to store. 
36680b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS256 36690b57cec5SDimitry Andric _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) 36700b57cec5SDimitry Andric { 36710b57cec5SDimitry Andric __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 36720b57cec5SDimitry Andric } 36730b57cec5SDimitry Andric 367406c3fb27SDimitry Andric /// Conditionally stores four 32-bit integer elements from the 128-bit vector 367506c3fb27SDimitry Andric /// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of 367606c3fb27SDimitry Andric /// the corresponding element in the mask \a __M is set; otherwise, the 367706c3fb27SDimitry Andric /// memory element is unchanged. 367806c3fb27SDimitry Andric /// 367906c3fb27SDimitry Andric /// \code{.operation} 368006c3fb27SDimitry Andric /// FOR i := 0 TO 3 368106c3fb27SDimitry Andric /// j := i*32 368206c3fb27SDimitry Andric /// IF __M[j+31] == 1 368306c3fb27SDimitry Andric /// Store32(__X+(i*4), __Y[j+31:j]) 368406c3fb27SDimitry Andric /// FI 368506c3fb27SDimitry Andric /// ENDFOR 368606c3fb27SDimitry Andric /// \endcode 368706c3fb27SDimitry Andric /// 368806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 368906c3fb27SDimitry Andric /// 369006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVD instruction. 369106c3fb27SDimitry Andric /// 369206c3fb27SDimitry Andric /// \param __X 369306c3fb27SDimitry Andric /// A pointer to the memory used for storing values. 369406c3fb27SDimitry Andric /// \param __M 369506c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the mask bits. 369606c3fb27SDimitry Andric /// \param __Y 369706c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the values to store. 
36980b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS128 36990b57cec5SDimitry Andric _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) 37000b57cec5SDimitry Andric { 37010b57cec5SDimitry Andric __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 37020b57cec5SDimitry Andric } 37030b57cec5SDimitry Andric 370406c3fb27SDimitry Andric /// Conditionally stores two 64-bit integer elements from the 128-bit vector 370506c3fb27SDimitry Andric /// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of 370606c3fb27SDimitry Andric /// the corresponding element in the mask \a __M is set; otherwise, the 370706c3fb27SDimitry Andric /// memory element is unchanged. 370806c3fb27SDimitry Andric /// 370906c3fb27SDimitry Andric /// \code{.operation} 371006c3fb27SDimitry Andric /// FOR i := 0 TO 1 371106c3fb27SDimitry Andric /// j := i*64 371206c3fb27SDimitry Andric /// IF __M[j+63] == 1 371306c3fb27SDimitry Andric /// Store64(__X+(i*8), __Y[j+63:j]) 371406c3fb27SDimitry Andric /// FI 371506c3fb27SDimitry Andric /// ENDFOR 371606c3fb27SDimitry Andric /// \endcode 371706c3fb27SDimitry Andric /// 371806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 371906c3fb27SDimitry Andric /// 372006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPMASKMOVQ instruction. 372106c3fb27SDimitry Andric /// 372206c3fb27SDimitry Andric /// \param __X 372306c3fb27SDimitry Andric /// A pointer to the memory used for storing values. 372406c3fb27SDimitry Andric /// \param __M 372506c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing the mask bits. 372606c3fb27SDimitry Andric /// \param __Y 372706c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing the values to store. 
37280b57cec5SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS128 37290b57cec5SDimitry Andric _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) 37300b57cec5SDimitry Andric { 37310b57cec5SDimitry Andric __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 37320b57cec5SDimitry Andric } 37330b57cec5SDimitry Andric 373406c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 373506c3fb27SDimitry Andric /// left by the number of bits given in the corresponding element of the 373606c3fb27SDimitry Andric /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 373706c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 373806c3fb27SDimitry Andric /// 31, the result for that element is zero. 373906c3fb27SDimitry Andric /// 374006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 374106c3fb27SDimitry Andric /// 374206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVD instruction. 374306c3fb27SDimitry Andric /// 374406c3fb27SDimitry Andric /// \param __X 374506c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 374606c3fb27SDimitry Andric /// \param __Y 374706c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 374806c3fb27SDimitry Andric /// bits). 374906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
37500b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 37510b57cec5SDimitry Andric _mm256_sllv_epi32(__m256i __X, __m256i __Y) 37520b57cec5SDimitry Andric { 37530b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); 37540b57cec5SDimitry Andric } 37550b57cec5SDimitry Andric 375606c3fb27SDimitry Andric /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 375706c3fb27SDimitry Andric /// left by the number of bits given in the corresponding element of the 375806c3fb27SDimitry Andric /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 375906c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 376006c3fb27SDimitry Andric /// 31, the result for that element is zero. 376106c3fb27SDimitry Andric /// 376206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 376306c3fb27SDimitry Andric /// 376406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVD instruction. 376506c3fb27SDimitry Andric /// 376606c3fb27SDimitry Andric /// \param __X 376706c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] to be shifted. 376806c3fb27SDimitry Andric /// \param __Y 376906c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 377006c3fb27SDimitry Andric /// bits). 377106c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result. 
37720b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 37730b57cec5SDimitry Andric _mm_sllv_epi32(__m128i __X, __m128i __Y) 37740b57cec5SDimitry Andric { 37750b57cec5SDimitry Andric return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); 37760b57cec5SDimitry Andric } 37770b57cec5SDimitry Andric 377806c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 377906c3fb27SDimitry Andric /// left by the number of bits given in the corresponding element of the 378006c3fb27SDimitry Andric /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 378106c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 378206c3fb27SDimitry Andric /// 63, the result for that element is zero. 378306c3fb27SDimitry Andric /// 378406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 378506c3fb27SDimitry Andric /// 378606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVQ instruction. 378706c3fb27SDimitry Andric /// 378806c3fb27SDimitry Andric /// \param __X 378906c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] to be shifted. 379006c3fb27SDimitry Andric /// \param __Y 379106c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 379206c3fb27SDimitry Andric /// bits). 379306c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 
37940b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 37950b57cec5SDimitry Andric _mm256_sllv_epi64(__m256i __X, __m256i __Y) 37960b57cec5SDimitry Andric { 37970b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); 37980b57cec5SDimitry Andric } 37990b57cec5SDimitry Andric 380006c3fb27SDimitry Andric /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 380106c3fb27SDimitry Andric /// left by the number of bits given in the corresponding element of the 380206c3fb27SDimitry Andric /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 380306c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 380406c3fb27SDimitry Andric /// 63, the result for that element is zero. 380506c3fb27SDimitry Andric /// 380606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 380706c3fb27SDimitry Andric /// 380806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSLLVQ instruction. 380906c3fb27SDimitry Andric /// 381006c3fb27SDimitry Andric /// \param __X 381106c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] to be shifted. 381206c3fb27SDimitry Andric /// \param __Y 381306c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 381406c3fb27SDimitry Andric /// bits). 381506c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the result. 
38160b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 38170b57cec5SDimitry Andric _mm_sllv_epi64(__m128i __X, __m128i __Y) 38180b57cec5SDimitry Andric { 38190b57cec5SDimitry Andric return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); 38200b57cec5SDimitry Andric } 38210b57cec5SDimitry Andric 382206c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 382306c3fb27SDimitry Andric /// right by the number of bits given in the corresponding element of the 382406c3fb27SDimitry Andric /// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and 382506c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 382606c3fb27SDimitry Andric /// 31, the result for that element is 0 or -1 according to the sign bit 382706c3fb27SDimitry Andric /// for that element. 382806c3fb27SDimitry Andric /// 382906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 383006c3fb27SDimitry Andric /// 383106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAVD instruction. 383206c3fb27SDimitry Andric /// 383306c3fb27SDimitry Andric /// \param __X 383406c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 383506c3fb27SDimitry Andric /// \param __Y 383606c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 383706c3fb27SDimitry Andric /// bits). 383806c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
38390b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 38400b57cec5SDimitry Andric _mm256_srav_epi32(__m256i __X, __m256i __Y) 38410b57cec5SDimitry Andric { 38420b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); 38430b57cec5SDimitry Andric } 38440b57cec5SDimitry Andric 384506c3fb27SDimitry Andric /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 384606c3fb27SDimitry Andric /// right by the number of bits given in the corresponding element of the 384706c3fb27SDimitry Andric /// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and 384806c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 384906c3fb27SDimitry Andric /// 31, the result for that element is 0 or -1 according to the sign bit 385006c3fb27SDimitry Andric /// for that element. 385106c3fb27SDimitry Andric /// 385206c3fb27SDimitry Andric /// \headerfile <immintrin.h> 385306c3fb27SDimitry Andric /// 385406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRAVD instruction. 385506c3fb27SDimitry Andric /// 385606c3fb27SDimitry Andric /// \param __X 385706c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] to be shifted. 385806c3fb27SDimitry Andric /// \param __Y 385906c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 386006c3fb27SDimitry Andric /// bits). 386106c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result. 
38620b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 38630b57cec5SDimitry Andric _mm_srav_epi32(__m128i __X, __m128i __Y) 38640b57cec5SDimitry Andric { 38650b57cec5SDimitry Andric return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); 38660b57cec5SDimitry Andric } 38670b57cec5SDimitry Andric 386806c3fb27SDimitry Andric /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X 386906c3fb27SDimitry Andric /// right by the number of bits given in the corresponding element of the 387006c3fb27SDimitry Andric /// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and 387106c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 387206c3fb27SDimitry Andric /// 31, the result for that element is zero. 387306c3fb27SDimitry Andric /// 387406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 387506c3fb27SDimitry Andric /// 387606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVD instruction. 387706c3fb27SDimitry Andric /// 387806c3fb27SDimitry Andric /// \param __X 387906c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] to be shifted. 388006c3fb27SDimitry Andric /// \param __Y 388106c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in 388206c3fb27SDimitry Andric /// bits). 388306c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the result. 
38840b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 38850b57cec5SDimitry Andric _mm256_srlv_epi32(__m256i __X, __m256i __Y) 38860b57cec5SDimitry Andric { 38870b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); 38880b57cec5SDimitry Andric } 38890b57cec5SDimitry Andric 389006c3fb27SDimitry Andric /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X 389106c3fb27SDimitry Andric /// right by the number of bits given in the corresponding element of the 389206c3fb27SDimitry Andric /// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and 389306c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 389406c3fb27SDimitry Andric /// 31, the result for that element is zero. 389506c3fb27SDimitry Andric /// 389606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 389706c3fb27SDimitry Andric /// 389806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVD instruction. 389906c3fb27SDimitry Andric /// 390006c3fb27SDimitry Andric /// \param __X 390106c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] to be shifted. 390206c3fb27SDimitry Andric /// \param __Y 390306c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in 390406c3fb27SDimitry Andric /// bits). 390506c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the result. 
39060b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 39070b57cec5SDimitry Andric _mm_srlv_epi32(__m128i __X, __m128i __Y) 39080b57cec5SDimitry Andric { 39090b57cec5SDimitry Andric return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); 39100b57cec5SDimitry Andric } 39110b57cec5SDimitry Andric 391206c3fb27SDimitry Andric /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X 391306c3fb27SDimitry Andric /// right by the number of bits given in the corresponding element of the 391406c3fb27SDimitry Andric /// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and 391506c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 391606c3fb27SDimitry Andric /// 63, the result for that element is zero. 391706c3fb27SDimitry Andric /// 391806c3fb27SDimitry Andric /// \headerfile <immintrin.h> 391906c3fb27SDimitry Andric /// 392006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVQ instruction. 392106c3fb27SDimitry Andric /// 392206c3fb27SDimitry Andric /// \param __X 392306c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] to be shifted. 392406c3fb27SDimitry Andric /// \param __Y 392506c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in 392606c3fb27SDimitry Andric /// bits). 392706c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the result. 
39280b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 39290b57cec5SDimitry Andric _mm256_srlv_epi64(__m256i __X, __m256i __Y) 39300b57cec5SDimitry Andric { 39310b57cec5SDimitry Andric return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); 39320b57cec5SDimitry Andric } 39330b57cec5SDimitry Andric 393406c3fb27SDimitry Andric /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X 393506c3fb27SDimitry Andric /// right by the number of bits given in the corresponding element of the 393606c3fb27SDimitry Andric /// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and 393706c3fb27SDimitry Andric /// returns the result. If the shift count for any element is greater than 393806c3fb27SDimitry Andric /// 63, the result for that element is zero. 393906c3fb27SDimitry Andric /// 394006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 394106c3fb27SDimitry Andric /// 394206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPSRLVQ instruction. 394306c3fb27SDimitry Andric /// 394406c3fb27SDimitry Andric /// \param __X 394506c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] to be shifted. 394606c3fb27SDimitry Andric /// \param __Y 394706c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in 394806c3fb27SDimitry Andric /// bits). 394906c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the result. 
39500b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 39510b57cec5SDimitry Andric _mm_srlv_epi64(__m128i __X, __m128i __Y) 39520b57cec5SDimitry Andric { 39530b57cec5SDimitry Andric return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); 39540b57cec5SDimitry Andric } 39550b57cec5SDimitry Andric 395606c3fb27SDimitry Andric /// Conditionally gathers two 64-bit floating-point values, either from the 395706c3fb27SDimitry Andric /// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled 395806c3fb27SDimitry Andric /// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector 395906c3fb27SDimitry Andric /// of [2 x double] in \a mask determines the source for each element. 396006c3fb27SDimitry Andric /// 396106c3fb27SDimitry Andric /// \code{.operation} 396206c3fb27SDimitry Andric /// FOR element := 0 to 1 396306c3fb27SDimitry Andric /// j := element*64 396406c3fb27SDimitry Andric /// k := element*32 396506c3fb27SDimitry Andric /// IF mask[j+63] == 0 396606c3fb27SDimitry Andric /// result[j+63:j] := a[j+63:j] 396706c3fb27SDimitry Andric /// ELSE 396806c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 396906c3fb27SDimitry Andric /// FI 397006c3fb27SDimitry Andric /// ENDFOR 397106c3fb27SDimitry Andric /// \endcode 397206c3fb27SDimitry Andric /// 397306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 397406c3fb27SDimitry Andric /// 397506c3fb27SDimitry Andric /// \code 397606c3fb27SDimitry Andric /// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i, 397706c3fb27SDimitry Andric /// __m128d mask, const int s); 397806c3fb27SDimitry Andric /// \endcode 397906c3fb27SDimitry Andric /// 398006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPD instruction. 
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///    The 256-bit vector of [4 x double] in \a mask determines the source for
///    each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits.
///    If a mask bit is zero, the corresponding value from vector \a a is
///    gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))

/// Conditionally gathers four 64-bit floating-point values, either from the
///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                         (double const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4df)(__m256d)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers eight 32-bit floating-point values, either from the
///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] containing the gathered values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                        (float const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8sf)(__m256)(mask), (s)))

/// Conditionally gathers two 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory. Only the first
///    two elements are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                     (float const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit floating-point values, either from the
///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x float] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x float] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] containing the gathered values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                        (float const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4sf)(__m128)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers eight 32-bit integer values, either from the
///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
///    of [8 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                        (int const *)(m), \
                                        (__v8si)(__m256i)(i), \
                                        (__v8si)(__m256i)(mask), (s)))

/// Conditionally gathers two 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for the lower two
///    elements. The upper two elements of the result are zeroed.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory. Only the first two elements
///    are used.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                     (int const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers four 32-bit integer values, either from the
///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
///    of [4 x i32] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                        (int const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4si)(__m128i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v4si)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4si)(__m128i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))

/// Conditionally gathers two 64-bit integer values, either from the
///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
///    of [2 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                     (long long const *)(m), \
                                     (__v2di)(__m128i)(i), \
                                     (__v2di)(__m128i)(mask), (s)))

/// Conditionally gathers four 64-bit integer values, either from the
///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
///    of [4 x i64] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] containing the mask. The most significant
///    bit of each element in the mask vector represents the mask bits. If a
///    mask bit is zero, the corresponding value from vector \a a is gathered;
///    otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                        (long long const *)(m), \
                                        (__v4di)(__m256i)(i), \
                                        (__v4di)(__m256i)(mask), (s)))

/// Gathers two 64-bit floating-point values from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
Only 475506c3fb27SDimitry Andric /// the first two elements are used. 475606c3fb27SDimitry Andric /// \param s 475706c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 475806c3fb27SDimitry Andric /// 1, 2, 4, or 8. 475906c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the gathered values. 47600b57cec5SDimitry Andric #define _mm_i32gather_pd(m, i, s) \ 4761349cc55cSDimitry Andric ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ 47620b57cec5SDimitry Andric (double const *)(m), \ 47630b57cec5SDimitry Andric (__v4si)(__m128i)(i), \ 47640b57cec5SDimitry Andric (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ 47650b57cec5SDimitry Andric _mm_setzero_pd()), \ 4766349cc55cSDimitry Andric (s))) 47670b57cec5SDimitry Andric 476806c3fb27SDimitry Andric /// Gathers four 64-bit floating-point values from memory \a m using scaled 476906c3fb27SDimitry Andric /// indexes from the 128-bit vector of [4 x i32] in \a i. 477006c3fb27SDimitry Andric /// 477106c3fb27SDimitry Andric /// \code{.operation} 477206c3fb27SDimitry Andric /// FOR element := 0 to 3 477306c3fb27SDimitry Andric /// j := element*64 477406c3fb27SDimitry Andric /// k := element*32 477506c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 477606c3fb27SDimitry Andric /// ENDFOR 477706c3fb27SDimitry Andric /// \endcode 477806c3fb27SDimitry Andric /// 477906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 478006c3fb27SDimitry Andric /// 478106c3fb27SDimitry Andric /// \code 478206c3fb27SDimitry Andric /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s); 478306c3fb27SDimitry Andric /// \endcode 478406c3fb27SDimitry Andric /// 478506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPD instruction. 478606c3fb27SDimitry Andric /// 478706c3fb27SDimitry Andric /// \param m 478806c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 
478906c3fb27SDimitry Andric /// \param i 479006c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 479106c3fb27SDimitry Andric /// \param s 479206c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 479306c3fb27SDimitry Andric /// 1, 2, 4, or 8. 479406c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the gathered values. 47950b57cec5SDimitry Andric #define _mm256_i32gather_pd(m, i, s) \ 4796349cc55cSDimitry Andric ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ 47970b57cec5SDimitry Andric (double const *)(m), \ 47980b57cec5SDimitry Andric (__v4si)(__m128i)(i), \ 47990b57cec5SDimitry Andric (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ 48000b57cec5SDimitry Andric _mm256_setzero_pd(), \ 48010b57cec5SDimitry Andric _CMP_EQ_OQ), \ 4802349cc55cSDimitry Andric (s))) 48030b57cec5SDimitry Andric 480406c3fb27SDimitry Andric /// Gathers two 64-bit floating-point values from memory \a m using scaled 480506c3fb27SDimitry Andric /// indexes from the 128-bit vector of [2 x i64] in \a i. 480606c3fb27SDimitry Andric /// 480706c3fb27SDimitry Andric /// \code{.operation} 480806c3fb27SDimitry Andric /// FOR element := 0 to 1 480906c3fb27SDimitry Andric /// j := element*64 481006c3fb27SDimitry Andric /// k := element*64 481106c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 481206c3fb27SDimitry Andric /// ENDFOR 481306c3fb27SDimitry Andric /// \endcode 481406c3fb27SDimitry Andric /// 481506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 481606c3fb27SDimitry Andric /// 481706c3fb27SDimitry Andric /// \code 481806c3fb27SDimitry Andric /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s); 481906c3fb27SDimitry Andric /// \endcode 482006c3fb27SDimitry Andric /// 482106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPD instruction. 
482206c3fb27SDimitry Andric /// 482306c3fb27SDimitry Andric /// \param m 482406c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 482506c3fb27SDimitry Andric /// \param i 482606c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 482706c3fb27SDimitry Andric /// \param s 482806c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 482906c3fb27SDimitry Andric /// 1, 2, 4, or 8. 483006c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the gathered values. 48310b57cec5SDimitry Andric #define _mm_i64gather_pd(m, i, s) \ 4832349cc55cSDimitry Andric ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ 48330b57cec5SDimitry Andric (double const *)(m), \ 48340b57cec5SDimitry Andric (__v2di)(__m128i)(i), \ 48350b57cec5SDimitry Andric (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ 48360b57cec5SDimitry Andric _mm_setzero_pd()), \ 4837349cc55cSDimitry Andric (s))) 48380b57cec5SDimitry Andric 483906c3fb27SDimitry Andric /// Gathers four 64-bit floating-point values from memory \a m using scaled 484006c3fb27SDimitry Andric /// indexes from the 256-bit vector of [4 x i64] in \a i. 
484106c3fb27SDimitry Andric /// 484206c3fb27SDimitry Andric /// \code{.operation} 484306c3fb27SDimitry Andric /// FOR element := 0 to 3 484406c3fb27SDimitry Andric /// j := element*64 484506c3fb27SDimitry Andric /// k := element*64 484606c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 484706c3fb27SDimitry Andric /// ENDFOR 484806c3fb27SDimitry Andric /// \endcode 484906c3fb27SDimitry Andric /// 485006c3fb27SDimitry Andric /// \headerfile <immintrin.h> 485106c3fb27SDimitry Andric /// 485206c3fb27SDimitry Andric /// \code 485306c3fb27SDimitry Andric /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s); 485406c3fb27SDimitry Andric /// \endcode 485506c3fb27SDimitry Andric /// 485606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPD instruction. 485706c3fb27SDimitry Andric /// 485806c3fb27SDimitry Andric /// \param m 485906c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 486006c3fb27SDimitry Andric /// \param i 486106c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 486206c3fb27SDimitry Andric /// \param s 486306c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 486406c3fb27SDimitry Andric /// 1, 2, 4, or 8. 486506c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the gathered values. 
48660b57cec5SDimitry Andric #define _mm256_i64gather_pd(m, i, s) \ 4867349cc55cSDimitry Andric ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ 48680b57cec5SDimitry Andric (double const *)(m), \ 48690b57cec5SDimitry Andric (__v4di)(__m256i)(i), \ 48700b57cec5SDimitry Andric (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ 48710b57cec5SDimitry Andric _mm256_setzero_pd(), \ 48720b57cec5SDimitry Andric _CMP_EQ_OQ), \ 4873349cc55cSDimitry Andric (s))) 48740b57cec5SDimitry Andric 487506c3fb27SDimitry Andric /// Gathers four 32-bit floating-point values from memory \a m using scaled 487606c3fb27SDimitry Andric /// indexes from the 128-bit vector of [4 x i32] in \a i. 487706c3fb27SDimitry Andric /// 487806c3fb27SDimitry Andric /// \code{.operation} 487906c3fb27SDimitry Andric /// FOR element := 0 to 3 488006c3fb27SDimitry Andric /// j := element*32 488106c3fb27SDimitry Andric /// k := element*32 488206c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 488306c3fb27SDimitry Andric /// ENDFOR 488406c3fb27SDimitry Andric /// \endcode 488506c3fb27SDimitry Andric /// 488606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 488706c3fb27SDimitry Andric /// 488806c3fb27SDimitry Andric /// \code 488906c3fb27SDimitry Andric /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s); 489006c3fb27SDimitry Andric /// \endcode 489106c3fb27SDimitry Andric /// 489206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPS instruction. 489306c3fb27SDimitry Andric /// 489406c3fb27SDimitry Andric /// \param m 489506c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 489606c3fb27SDimitry Andric /// \param i 489706c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 489806c3fb27SDimitry Andric /// \param s 489906c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 490006c3fb27SDimitry Andric /// 1, 2, 4, or 8. 
490106c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values. 49020b57cec5SDimitry Andric #define _mm_i32gather_ps(m, i, s) \ 4903349cc55cSDimitry Andric ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ 49040b57cec5SDimitry Andric (float const *)(m), \ 49050b57cec5SDimitry Andric (__v4si)(__m128i)(i), \ 49060b57cec5SDimitry Andric (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 49070b57cec5SDimitry Andric _mm_setzero_ps()), \ 4908349cc55cSDimitry Andric (s))) 49090b57cec5SDimitry Andric 491006c3fb27SDimitry Andric /// Gathers eight 32-bit floating-point values from memory \a m using scaled 491106c3fb27SDimitry Andric /// indexes from the 256-bit vector of [8 x i32] in \a i. 491206c3fb27SDimitry Andric /// 491306c3fb27SDimitry Andric /// \code{.operation} 491406c3fb27SDimitry Andric /// FOR element := 0 to 7 491506c3fb27SDimitry Andric /// j := element*32 491606c3fb27SDimitry Andric /// k := element*32 491706c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 491806c3fb27SDimitry Andric /// ENDFOR 491906c3fb27SDimitry Andric /// \endcode 492006c3fb27SDimitry Andric /// 492106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 492206c3fb27SDimitry Andric /// 492306c3fb27SDimitry Andric /// \code 492406c3fb27SDimitry Andric /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s); 492506c3fb27SDimitry Andric /// \endcode 492606c3fb27SDimitry Andric /// 492706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERDPS instruction. 492806c3fb27SDimitry Andric /// 492906c3fb27SDimitry Andric /// \param m 493006c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 493106c3fb27SDimitry Andric /// \param i 493206c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing signed indexes into \a m. 493306c3fb27SDimitry Andric /// \param s 493406c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. 
Must be 493506c3fb27SDimitry Andric /// 1, 2, 4, or 8. 493606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the gathered values. 49370b57cec5SDimitry Andric #define _mm256_i32gather_ps(m, i, s) \ 4938349cc55cSDimitry Andric ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ 49390b57cec5SDimitry Andric (float const *)(m), \ 49400b57cec5SDimitry Andric (__v8si)(__m256i)(i), \ 49410b57cec5SDimitry Andric (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ 49420b57cec5SDimitry Andric _mm256_setzero_ps(), \ 49430b57cec5SDimitry Andric _CMP_EQ_OQ), \ 4944349cc55cSDimitry Andric (s))) 49450b57cec5SDimitry Andric 494606c3fb27SDimitry Andric /// Gathers two 32-bit floating-point values from memory \a m using scaled 494706c3fb27SDimitry Andric /// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two 494806c3fb27SDimitry Andric /// elements of the result are zeroed. 494906c3fb27SDimitry Andric /// 495006c3fb27SDimitry Andric /// \code{.operation} 495106c3fb27SDimitry Andric /// FOR element := 0 to 1 495206c3fb27SDimitry Andric /// j := element*32 495306c3fb27SDimitry Andric /// k := element*64 495406c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 495506c3fb27SDimitry Andric /// ENDFOR 495606c3fb27SDimitry Andric /// result[127:64] := 0 495706c3fb27SDimitry Andric /// \endcode 495806c3fb27SDimitry Andric /// 495906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 496006c3fb27SDimitry Andric /// 496106c3fb27SDimitry Andric /// \code 496206c3fb27SDimitry Andric /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s); 496306c3fb27SDimitry Andric /// \endcode 496406c3fb27SDimitry Andric /// 496506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPS instruction. 496606c3fb27SDimitry Andric /// 496706c3fb27SDimitry Andric /// \param m 496806c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 
496906c3fb27SDimitry Andric /// \param i 497006c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 497106c3fb27SDimitry Andric /// \param s 497206c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 497306c3fb27SDimitry Andric /// 1, 2, 4, or 8. 497406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values. 49750b57cec5SDimitry Andric #define _mm_i64gather_ps(m, i, s) \ 4976349cc55cSDimitry Andric ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ 49770b57cec5SDimitry Andric (float const *)(m), \ 49780b57cec5SDimitry Andric (__v2di)(__m128i)(i), \ 49790b57cec5SDimitry Andric (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 49800b57cec5SDimitry Andric _mm_setzero_ps()), \ 4981349cc55cSDimitry Andric (s))) 49820b57cec5SDimitry Andric 498306c3fb27SDimitry Andric /// Gathers four 32-bit floating-point values from memory \a m using scaled 498406c3fb27SDimitry Andric /// indexes from the 256-bit vector of [4 x i64] in \a i. 498506c3fb27SDimitry Andric /// 498606c3fb27SDimitry Andric /// \code{.operation} 498706c3fb27SDimitry Andric /// FOR element := 0 to 3 498806c3fb27SDimitry Andric /// j := element*32 498906c3fb27SDimitry Andric /// k := element*64 499006c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 499106c3fb27SDimitry Andric /// ENDFOR 499206c3fb27SDimitry Andric /// \endcode 499306c3fb27SDimitry Andric /// 499406c3fb27SDimitry Andric /// \headerfile <immintrin.h> 499506c3fb27SDimitry Andric /// 499606c3fb27SDimitry Andric /// \code 499706c3fb27SDimitry Andric /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s); 499806c3fb27SDimitry Andric /// \endcode 499906c3fb27SDimitry Andric /// 500006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VGATHERQPS instruction.
500106c3fb27SDimitry Andric /// 500206c3fb27SDimitry Andric /// \param m 500306c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 500406c3fb27SDimitry Andric /// \param i 500506c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 500606c3fb27SDimitry Andric /// \param s 500706c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 500806c3fb27SDimitry Andric /// 1, 2, 4, or 8. 500906c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the gathered values. 50100b57cec5SDimitry Andric #define _mm256_i64gather_ps(m, i, s) \ 5011349cc55cSDimitry Andric ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ 50120b57cec5SDimitry Andric (float const *)(m), \ 50130b57cec5SDimitry Andric (__v4di)(__m256i)(i), \ 50140b57cec5SDimitry Andric (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ 50150b57cec5SDimitry Andric _mm_setzero_ps()), \ 5016349cc55cSDimitry Andric (s))) 50170b57cec5SDimitry Andric 501806c3fb27SDimitry Andric /// Gathers four 32-bit integer values from memory \a m using scaled 501906c3fb27SDimitry Andric /// indexes from the 128-bit vector of [4 x i32] in \a i.
502006c3fb27SDimitry Andric /// 502106c3fb27SDimitry Andric /// \code{.operation} 502206c3fb27SDimitry Andric /// FOR element := 0 to 3 502306c3fb27SDimitry Andric /// j := element*32 502406c3fb27SDimitry Andric /// k := element*32 502506c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 502606c3fb27SDimitry Andric /// ENDFOR 502706c3fb27SDimitry Andric /// \endcode 502806c3fb27SDimitry Andric /// 502906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 503006c3fb27SDimitry Andric /// 503106c3fb27SDimitry Andric /// \code 503206c3fb27SDimitry Andric /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s); 503306c3fb27SDimitry Andric /// \endcode 503406c3fb27SDimitry Andric /// 503506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDD instruction. 503606c3fb27SDimitry Andric /// 503706c3fb27SDimitry Andric /// \param m 503806c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 503906c3fb27SDimitry Andric /// \param i 504006c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 504106c3fb27SDimitry Andric /// \param s 504206c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 504306c3fb27SDimitry Andric /// 1, 2, 4, or 8. 504406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values. 50450b57cec5SDimitry Andric #define _mm_i32gather_epi32(m, i, s) \ 5046349cc55cSDimitry Andric ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \ 50470b57cec5SDimitry Andric (int const *)(m), (__v4si)(__m128i)(i), \ 5048349cc55cSDimitry Andric (__v4si)_mm_set1_epi32(-1), (s))) 50490b57cec5SDimitry Andric 505006c3fb27SDimitry Andric /// Gathers eight 32-bit integer values from memory \a m using scaled 505106c3fb27SDimitry Andric /// indexes from the 256-bit vector of [8 x i32] in \a i.
505206c3fb27SDimitry Andric /// 505306c3fb27SDimitry Andric /// \code{.operation} 505406c3fb27SDimitry Andric /// FOR element := 0 to 7 505506c3fb27SDimitry Andric /// j := element*32 505606c3fb27SDimitry Andric /// k := element*32 505706c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s) 505806c3fb27SDimitry Andric /// ENDFOR 505906c3fb27SDimitry Andric /// \endcode 506006c3fb27SDimitry Andric /// 506106c3fb27SDimitry Andric /// \headerfile <immintrin.h> 506206c3fb27SDimitry Andric /// 506306c3fb27SDimitry Andric /// \code 506406c3fb27SDimitry Andric /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s); 506506c3fb27SDimitry Andric /// \endcode 506606c3fb27SDimitry Andric /// 506706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDD instruction. 506806c3fb27SDimitry Andric /// 506906c3fb27SDimitry Andric /// \param m 507006c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 507106c3fb27SDimitry Andric /// \param i 507206c3fb27SDimitry Andric /// A 256-bit vector of [8 x i32] containing signed indexes into \a m. 507306c3fb27SDimitry Andric /// \param s 507406c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 507506c3fb27SDimitry Andric /// 1, 2, 4, or 8. 507606c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x i32] containing the gathered values. 50770b57cec5SDimitry Andric #define _mm256_i32gather_epi32(m, i, s) \ 5078349cc55cSDimitry Andric ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \ 50790b57cec5SDimitry Andric (int const *)(m), (__v8si)(__m256i)(i), \ 5080349cc55cSDimitry Andric (__v8si)_mm256_set1_epi32(-1), (s))) 50810b57cec5SDimitry Andric 508206c3fb27SDimitry Andric /// Gathers two 32-bit integer values from memory \a m using scaled indexes 508306c3fb27SDimitry Andric /// from the 128-bit vector of [2 x i64] in \a i. 
The upper two elements 508406c3fb27SDimitry Andric /// of the result are zeroed. 508506c3fb27SDimitry Andric /// 508606c3fb27SDimitry Andric /// \code{.operation} 508706c3fb27SDimitry Andric /// FOR element := 0 to 1 508806c3fb27SDimitry Andric /// j := element*32 508906c3fb27SDimitry Andric /// k := element*64 509006c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 509106c3fb27SDimitry Andric /// ENDFOR 509206c3fb27SDimitry Andric /// result[127:64] := 0 509306c3fb27SDimitry Andric /// \endcode 509406c3fb27SDimitry Andric /// 509506c3fb27SDimitry Andric /// \headerfile <immintrin.h> 509606c3fb27SDimitry Andric /// 509706c3fb27SDimitry Andric /// \code 509806c3fb27SDimitry Andric /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s); 509906c3fb27SDimitry Andric /// \endcode 510006c3fb27SDimitry Andric /// 510106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQD instruction. 510206c3fb27SDimitry Andric /// 510306c3fb27SDimitry Andric /// \param m 510406c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 510506c3fb27SDimitry Andric /// \param i 510606c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 510706c3fb27SDimitry Andric /// \param s 510806c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 510906c3fb27SDimitry Andric /// 1, 2, 4, or 8. 511006c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values. 
51110b57cec5SDimitry Andric #define _mm_i64gather_epi32(m, i, s) \ 5112349cc55cSDimitry Andric ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \ 51130b57cec5SDimitry Andric (int const *)(m), (__v2di)(__m128i)(i), \ 5114349cc55cSDimitry Andric (__v4si)_mm_set1_epi32(-1), (s))) 51150b57cec5SDimitry Andric 511606c3fb27SDimitry Andric /// Gathers four 32-bit integer values from memory \a m using scaled indexes 511706c3fb27SDimitry Andric /// from the 256-bit vector of [4 x i64] in \a i. 511806c3fb27SDimitry Andric /// 511906c3fb27SDimitry Andric /// \code{.operation} 512006c3fb27SDimitry Andric /// FOR element := 0 to 3 512106c3fb27SDimitry Andric /// j := element*32 512206c3fb27SDimitry Andric /// k := element*64 512306c3fb27SDimitry Andric /// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s) 512406c3fb27SDimitry Andric /// ENDFOR 512506c3fb27SDimitry Andric /// \endcode 512606c3fb27SDimitry Andric /// 512706c3fb27SDimitry Andric /// \headerfile <immintrin.h> 512806c3fb27SDimitry Andric /// 512906c3fb27SDimitry Andric /// \code 513006c3fb27SDimitry Andric /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s); 513106c3fb27SDimitry Andric /// \endcode 513206c3fb27SDimitry Andric /// 513306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQD instruction. 513406c3fb27SDimitry Andric /// 513506c3fb27SDimitry Andric /// \param m 513606c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 513706c3fb27SDimitry Andric /// \param i 513806c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 513906c3fb27SDimitry Andric /// \param s 514006c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 514106c3fb27SDimitry Andric /// 1, 2, 4, or 8. 514206c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x i32] containing the gathered values. 
51430b57cec5SDimitry Andric #define _mm256_i64gather_epi32(m, i, s) \ 5144349cc55cSDimitry Andric ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \ 51450b57cec5SDimitry Andric (int const *)(m), (__v4di)(__m256i)(i), \ 5146349cc55cSDimitry Andric (__v4si)_mm_set1_epi32(-1), (s))) 51470b57cec5SDimitry Andric 514806c3fb27SDimitry Andric /// Gathers two 64-bit integer values from memory \a m using scaled indexes 514906c3fb27SDimitry Andric /// from the 128-bit vector of [4 x i32] in \a i. 515006c3fb27SDimitry Andric /// 515106c3fb27SDimitry Andric /// \code{.operation} 515206c3fb27SDimitry Andric /// FOR element := 0 to 1 515306c3fb27SDimitry Andric /// j := element*64 515406c3fb27SDimitry Andric /// k := element*32 515506c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 515606c3fb27SDimitry Andric /// ENDFOR 515706c3fb27SDimitry Andric /// \endcode 515806c3fb27SDimitry Andric /// 515906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 516006c3fb27SDimitry Andric /// 516106c3fb27SDimitry Andric /// \code 516206c3fb27SDimitry Andric /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s); 516306c3fb27SDimitry Andric /// \endcode 516406c3fb27SDimitry Andric /// 516506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDQ instruction. 516606c3fb27SDimitry Andric /// 516706c3fb27SDimitry Andric /// \param m 516806c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 516906c3fb27SDimitry Andric /// \param i 517006c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only 517106c3fb27SDimitry Andric /// the first two elements are used. 517206c3fb27SDimitry Andric /// \param s 517306c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 517406c3fb27SDimitry Andric /// 1, 2, 4, or 8. 517506c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the gathered values. 
51760b57cec5SDimitry Andric #define _mm_i32gather_epi64(m, i, s) \ 5177349cc55cSDimitry Andric ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \ 51780b57cec5SDimitry Andric (long long const *)(m), \ 51790b57cec5SDimitry Andric (__v4si)(__m128i)(i), \ 5180349cc55cSDimitry Andric (__v2di)_mm_set1_epi64x(-1), (s))) 51810b57cec5SDimitry Andric 518206c3fb27SDimitry Andric /// Gathers four 64-bit integer values from memory \a m using scaled indexes 518306c3fb27SDimitry Andric /// from the 128-bit vector of [4 x i32] in \a i. 518406c3fb27SDimitry Andric /// 518506c3fb27SDimitry Andric /// \code{.operation} 518606c3fb27SDimitry Andric /// FOR element := 0 to 3 518706c3fb27SDimitry Andric /// j := element*64 518806c3fb27SDimitry Andric /// k := element*32 518906c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s) 519006c3fb27SDimitry Andric /// ENDFOR 519106c3fb27SDimitry Andric /// \endcode 519206c3fb27SDimitry Andric /// 519306c3fb27SDimitry Andric /// \headerfile <immintrin.h> 519406c3fb27SDimitry Andric /// 519506c3fb27SDimitry Andric /// \code 519606c3fb27SDimitry Andric /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s); 519706c3fb27SDimitry Andric /// \endcode 519806c3fb27SDimitry Andric /// 519906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERDQ instruction. 520006c3fb27SDimitry Andric /// 520106c3fb27SDimitry Andric /// \param m 520206c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 520306c3fb27SDimitry Andric /// \param i 520406c3fb27SDimitry Andric /// A 128-bit vector of [4 x i32] containing signed indexes into \a m. 520506c3fb27SDimitry Andric /// \param s 520606c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 520706c3fb27SDimitry Andric /// 1, 2, 4, or 8. 520806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the gathered values. 
52090b57cec5SDimitry Andric #define _mm256_i32gather_epi64(m, i, s) \ 5210349cc55cSDimitry Andric ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \ 52110b57cec5SDimitry Andric (long long const *)(m), \ 52120b57cec5SDimitry Andric (__v4si)(__m128i)(i), \ 5213349cc55cSDimitry Andric (__v4di)_mm256_set1_epi64x(-1), (s))) 52140b57cec5SDimitry Andric 521506c3fb27SDimitry Andric /// Gathers two 64-bit integer values from memory \a m using scaled indexes 521606c3fb27SDimitry Andric /// from the 128-bit vector of [2 x i64] in \a i. 521706c3fb27SDimitry Andric /// 521806c3fb27SDimitry Andric /// \code{.operation} 521906c3fb27SDimitry Andric /// FOR element := 0 to 1 522006c3fb27SDimitry Andric /// j := element*64 522106c3fb27SDimitry Andric /// k := element*64 522206c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 522306c3fb27SDimitry Andric /// ENDFOR 522406c3fb27SDimitry Andric /// \endcode 522506c3fb27SDimitry Andric /// 522606c3fb27SDimitry Andric /// \headerfile <immintrin.h> 522706c3fb27SDimitry Andric /// 522806c3fb27SDimitry Andric /// \code 522906c3fb27SDimitry Andric /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s); 523006c3fb27SDimitry Andric /// \endcode 523106c3fb27SDimitry Andric /// 523206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQQ instruction. 523306c3fb27SDimitry Andric /// 523406c3fb27SDimitry Andric /// \param m 523506c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 523606c3fb27SDimitry Andric /// \param i 523706c3fb27SDimitry Andric /// A 128-bit vector of [2 x i64] containing signed indexes into \a m. 523806c3fb27SDimitry Andric /// \param s 523906c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 524006c3fb27SDimitry Andric /// 1, 2, 4, or 8. 524106c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x i64] containing the gathered values. 
52420b57cec5SDimitry Andric #define _mm_i64gather_epi64(m, i, s) \ 5243349cc55cSDimitry Andric ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ 52440b57cec5SDimitry Andric (long long const *)(m), \ 52450b57cec5SDimitry Andric (__v2di)(__m128i)(i), \ 5246349cc55cSDimitry Andric (__v2di)_mm_set1_epi64x(-1), (s))) 52470b57cec5SDimitry Andric 524806c3fb27SDimitry Andric /// Gathers four 64-bit integer values from memory \a m using scaled indexes 524906c3fb27SDimitry Andric /// from the 256-bit vector of [4 x i64] in \a i. 525006c3fb27SDimitry Andric /// 525106c3fb27SDimitry Andric /// \code{.operation} 525206c3fb27SDimitry Andric /// FOR element := 0 to 3 525306c3fb27SDimitry Andric /// j := element*64 525406c3fb27SDimitry Andric /// k := element*64 525506c3fb27SDimitry Andric /// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s) 525606c3fb27SDimitry Andric /// ENDFOR 525706c3fb27SDimitry Andric /// \endcode 525806c3fb27SDimitry Andric /// 525906c3fb27SDimitry Andric /// \headerfile <immintrin.h> 526006c3fb27SDimitry Andric /// 526106c3fb27SDimitry Andric /// \code 526206c3fb27SDimitry Andric /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s); 526306c3fb27SDimitry Andric /// \endcode 526406c3fb27SDimitry Andric /// 526506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPGATHERQQ instruction. 526606c3fb27SDimitry Andric /// 526706c3fb27SDimitry Andric /// \param m 526806c3fb27SDimitry Andric /// A pointer to the memory used for loading values. 526906c3fb27SDimitry Andric /// \param i 527006c3fb27SDimitry Andric /// A 256-bit vector of [4 x i64] containing signed indexes into \a m. 527106c3fb27SDimitry Andric /// \param s 527206c3fb27SDimitry Andric /// A literal constant scale factor for the indexes in \a i. Must be 527306c3fb27SDimitry Andric /// 1, 2, 4, or 8. 527406c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x i64] containing the gathered values. 
52750b57cec5SDimitry Andric #define _mm256_i64gather_epi64(m, i, s) \ 5276349cc55cSDimitry Andric ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ 52770b57cec5SDimitry Andric (long long const *)(m), \ 52780b57cec5SDimitry Andric (__v4di)(__m256i)(i), \ 5279349cc55cSDimitry Andric (__v4di)_mm256_set1_epi64x(-1), (s))) 52800b57cec5SDimitry Andric 52810b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS256 52820b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS128 52830b57cec5SDimitry Andric 52840b57cec5SDimitry Andric #endif /* __AVX2INTRIN_H */ 5285