10b57cec5SDimitry Andric /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== 20b57cec5SDimitry Andric * 30b57cec5SDimitry Andric * 40b57cec5SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 50b57cec5SDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 60b57cec5SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 70b57cec5SDimitry Andric * 80b57cec5SDimitry Andric *===-----------------------------------------------------------------------=== 90b57cec5SDimitry Andric */ 100b57cec5SDimitry Andric #ifndef __IMMINTRIN_H 110b57cec5SDimitry Andric #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead." 120b57cec5SDimitry Andric #endif 130b57cec5SDimitry Andric 140b57cec5SDimitry Andric #ifndef __AVX512VLVNNIINTRIN_H 150b57cec5SDimitry Andric #define __AVX512VLVNNIINTRIN_H 160b57cec5SDimitry Andric 170b57cec5SDimitry Andric /* Define the default attributes for the functions in this file. */ 18*5f757f3fSDimitry Andric #define __DEFAULT_FN_ATTRS128 \ 19*5f757f3fSDimitry Andric __attribute__((__always_inline__, __nodebug__, \ 20*5f757f3fSDimitry Andric __target__("avx512vl,avx512vnni,no-evex512"), \ 21*5f757f3fSDimitry Andric __min_vector_width__(128))) 22*5f757f3fSDimitry Andric #define __DEFAULT_FN_ATTRS256 \ 23*5f757f3fSDimitry Andric __attribute__((__always_inline__, __nodebug__, \ 24*5f757f3fSDimitry Andric __target__("avx512vl,avx512vnni,no-evex512"), \ 25*5f757f3fSDimitry Andric __min_vector_width__(256))) 260b57cec5SDimitry Andric 27e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 28e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 29e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 30e8d8bef9SDimitry Andric /// in \a S, and store the packed 32-bit results in DST. 31e8d8bef9SDimitry Andric /// 32e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 33e8d8bef9SDimitry Andric /// 3481ad6265SDimitry Andric /// \code{.operation} 35e8d8bef9SDimitry Andric /// FOR j := 0 to 7 36e8d8bef9SDimitry Andric /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 37e8d8bef9SDimitry Andric /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 38e8d8bef9SDimitry Andric /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 39e8d8bef9SDimitry Andric /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 40e8d8bef9SDimitry Andric /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 41e8d8bef9SDimitry Andric /// ENDFOR 42e8d8bef9SDimitry Andric /// DST[MAX:256] := 0 4381ad6265SDimitry Andric /// \endcode 44e8d8bef9SDimitry Andric #define _mm256_dpbusd_epi32(S, A, B) \ 45349cc55cSDimitry Andric ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 460b57cec5SDimitry Andric 47e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 48e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 49e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 50e8d8bef9SDimitry Andric /// in \a S using signed saturation, and store the packed 32-bit results in DST. 51e8d8bef9SDimitry Andric /// 52e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 53e8d8bef9SDimitry Andric /// 5481ad6265SDimitry Andric /// \code{.operation} 55e8d8bef9SDimitry Andric /// FOR j := 0 to 7 56e8d8bef9SDimitry Andric /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 57e8d8bef9SDimitry Andric /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 58e8d8bef9SDimitry Andric /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 59e8d8bef9SDimitry Andric /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 60e8d8bef9SDimitry Andric /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 61e8d8bef9SDimitry Andric /// ENDFOR 62e8d8bef9SDimitry Andric /// DST[MAX:256] := 0 6381ad6265SDimitry Andric /// \endcode 64e8d8bef9SDimitry Andric #define _mm256_dpbusds_epi32(S, A, B) \ 65349cc55cSDimitry Andric ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 66e8d8bef9SDimitry Andric 67e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 68e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 69e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 70e8d8bef9SDimitry Andric /// and store the packed 32-bit results in DST. 71e8d8bef9SDimitry Andric /// 72e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 73e8d8bef9SDimitry Andric /// 7481ad6265SDimitry Andric /// \code{.operation} 75e8d8bef9SDimitry Andric /// FOR j := 0 to 7 76e8d8bef9SDimitry Andric /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 77e8d8bef9SDimitry Andric /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 78e8d8bef9SDimitry Andric /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 79e8d8bef9SDimitry Andric /// ENDFOR 80e8d8bef9SDimitry Andric /// DST[MAX:256] := 0 8181ad6265SDimitry Andric /// \endcode 82e8d8bef9SDimitry Andric #define _mm256_dpwssd_epi32(S, A, B) \ 83349cc55cSDimitry Andric ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 84e8d8bef9SDimitry Andric 85e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 86e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 87e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a S 88e8d8bef9SDimitry Andric /// using signed saturation, and store the packed 32-bit results in DST. 89e8d8bef9SDimitry Andric /// 90e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 91e8d8bef9SDimitry Andric /// 9281ad6265SDimitry Andric /// \code{.operation} 93e8d8bef9SDimitry Andric /// FOR j := 0 to 7 94e8d8bef9SDimitry Andric /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 95e8d8bef9SDimitry Andric /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 96e8d8bef9SDimitry Andric /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 97e8d8bef9SDimitry Andric /// ENDFOR 98e8d8bef9SDimitry Andric /// DST[MAX:256] := 0 9981ad6265SDimitry Andric /// \endcode 100e8d8bef9SDimitry Andric #define _mm256_dpwssds_epi32(S, A, B) \ 101349cc55cSDimitry Andric ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 102e8d8bef9SDimitry Andric 103e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 104e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 105e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 106e8d8bef9SDimitry Andric /// in \a S, and store the packed 32-bit results in DST. 107e8d8bef9SDimitry Andric /// 108e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 109e8d8bef9SDimitry Andric /// 11081ad6265SDimitry Andric /// \code{.operation} 111e8d8bef9SDimitry Andric /// FOR j := 0 to 3 112e8d8bef9SDimitry Andric /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 113e8d8bef9SDimitry Andric /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 114e8d8bef9SDimitry Andric /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 115e8d8bef9SDimitry Andric /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 116e8d8bef9SDimitry Andric /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 117e8d8bef9SDimitry Andric /// ENDFOR 118e8d8bef9SDimitry Andric /// DST[MAX:128] := 0 11981ad6265SDimitry Andric /// \endcode 120e8d8bef9SDimitry Andric #define _mm_dpbusd_epi32(S, A, B) \ 121349cc55cSDimitry Andric ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 122e8d8bef9SDimitry Andric 123e8d8bef9SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 124e8d8bef9SDimitry Andric /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 125e8d8bef9SDimitry Andric /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 126e8d8bef9SDimitry Andric /// in \a S using signed saturation, and store the packed 32-bit results in DST. 127e8d8bef9SDimitry Andric /// 128e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 129e8d8bef9SDimitry Andric /// 13081ad6265SDimitry Andric /// \code{.operation} 131e8d8bef9SDimitry Andric /// FOR j := 0 to 3 132e8d8bef9SDimitry Andric /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 133e8d8bef9SDimitry Andric /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 134e8d8bef9SDimitry Andric /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 135e8d8bef9SDimitry Andric /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 136e8d8bef9SDimitry Andric /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 137e8d8bef9SDimitry Andric /// ENDFOR 138e8d8bef9SDimitry Andric /// DST[MAX:128] := 0 13981ad6265SDimitry Andric /// \endcode 140e8d8bef9SDimitry Andric #define _mm_dpbusds_epi32(S, A, B) \ 141349cc55cSDimitry Andric ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 142e8d8bef9SDimitry Andric 143e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 144e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 145e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 146e8d8bef9SDimitry Andric /// and store the packed 32-bit results in DST. 147e8d8bef9SDimitry Andric /// 148e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 149e8d8bef9SDimitry Andric /// 15081ad6265SDimitry Andric /// \code{.operation} 151e8d8bef9SDimitry Andric /// FOR j := 0 to 3 152e8d8bef9SDimitry Andric /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 153e8d8bef9SDimitry Andric /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 154e8d8bef9SDimitry Andric /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 155e8d8bef9SDimitry Andric /// ENDFOR 156e8d8bef9SDimitry Andric /// DST[MAX:128] := 0 15781ad6265SDimitry Andric /// \endcode 158e8d8bef9SDimitry Andric #define _mm_dpwssd_epi32(S, A, B) \ 159349cc55cSDimitry Andric ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 160e8d8bef9SDimitry Andric 161e8d8bef9SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 162e8d8bef9SDimitry Andric /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 163e8d8bef9SDimitry Andric /// results. Sum these 2 results with the corresponding 32-bit integer in \a S 164e8d8bef9SDimitry Andric /// using signed saturation, and store the packed 32-bit results in DST. 165e8d8bef9SDimitry Andric /// 166e8d8bef9SDimitry Andric /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 167e8d8bef9SDimitry Andric /// 16881ad6265SDimitry Andric /// \code{.operation} 169e8d8bef9SDimitry Andric /// FOR j := 0 to 3 170e8d8bef9SDimitry Andric /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 171e8d8bef9SDimitry Andric /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 172e8d8bef9SDimitry Andric /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 173e8d8bef9SDimitry Andric /// ENDFOR 174e8d8bef9SDimitry Andric /// DST[MAX:128] := 0 17581ad6265SDimitry Andric /// \endcode 176e8d8bef9SDimitry Andric #define _mm_dpwssds_epi32(S, A, B) \ 177349cc55cSDimitry Andric ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 1780b57cec5SDimitry Andric 1790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 1800b57cec5SDimitry Andric _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 1810b57cec5SDimitry Andric { 1820b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 1830b57cec5SDimitry Andric (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 1840b57cec5SDimitry Andric (__v8si)__S); 1850b57cec5SDimitry Andric } 1860b57cec5SDimitry Andric 1870b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 1880b57cec5SDimitry Andric _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 1890b57cec5SDimitry Andric { 1900b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 1910b57cec5SDimitry Andric (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 1920b57cec5SDimitry Andric (__v8si)_mm256_setzero_si256()); 1930b57cec5SDimitry Andric } 1940b57cec5SDimitry Andric 1950b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 1960b57cec5SDimitry Andric _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 1970b57cec5SDimitry Andric { 1980b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 1990b57cec5SDimitry Andric (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 2000b57cec5SDimitry Andric (__v8si)__S); 2010b57cec5SDimitry Andric } 2020b57cec5SDimitry Andric 2030b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2040b57cec5SDimitry Andric _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 2050b57cec5SDimitry Andric { 2060b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 2070b57cec5SDimitry Andric (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 2080b57cec5SDimitry Andric (__v8si)_mm256_setzero_si256()); 2090b57cec5SDimitry Andric } 2100b57cec5SDimitry Andric 2110b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2120b57cec5SDimitry Andric _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 2130b57cec5SDimitry Andric { 2140b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 2150b57cec5SDimitry Andric (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 2160b57cec5SDimitry Andric (__v8si)__S); 2170b57cec5SDimitry Andric } 2180b57cec5SDimitry Andric 2190b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2200b57cec5SDimitry Andric _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 2210b57cec5SDimitry Andric { 2220b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 2230b57cec5SDimitry Andric (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 2240b57cec5SDimitry Andric (__v8si)_mm256_setzero_si256()); 2250b57cec5SDimitry Andric } 2260b57cec5SDimitry Andric 2270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2280b57cec5SDimitry Andric _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 2290b57cec5SDimitry Andric { 2300b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 2310b57cec5SDimitry Andric (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 2320b57cec5SDimitry Andric (__v8si)__S); 2330b57cec5SDimitry Andric } 2340b57cec5SDimitry Andric 2350b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256 2360b57cec5SDimitry Andric _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 2370b57cec5SDimitry Andric { 2380b57cec5SDimitry Andric return (__m256i)__builtin_ia32_selectd_256(__U, 2390b57cec5SDimitry Andric (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 2400b57cec5SDimitry Andric (__v8si)_mm256_setzero_si256()); 2410b57cec5SDimitry Andric } 2420b57cec5SDimitry Andric 2430b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 2440b57cec5SDimitry Andric _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 2450b57cec5SDimitry Andric { 2460b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 2470b57cec5SDimitry Andric (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 2480b57cec5SDimitry Andric (__v4si)__S); 2490b57cec5SDimitry Andric } 2500b57cec5SDimitry Andric 2510b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 2520b57cec5SDimitry Andric _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 2530b57cec5SDimitry Andric { 2540b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 2550b57cec5SDimitry Andric (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 2560b57cec5SDimitry Andric (__v4si)_mm_setzero_si128()); 2570b57cec5SDimitry Andric } 2580b57cec5SDimitry Andric 2590b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 2600b57cec5SDimitry Andric _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 2610b57cec5SDimitry Andric { 2620b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 2630b57cec5SDimitry Andric (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 2640b57cec5SDimitry Andric (__v4si)__S); 2650b57cec5SDimitry Andric } 2660b57cec5SDimitry Andric 2670b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 2680b57cec5SDimitry Andric _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 2690b57cec5SDimitry Andric { 2700b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 2710b57cec5SDimitry Andric (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 2720b57cec5SDimitry Andric (__v4si)_mm_setzero_si128()); 2730b57cec5SDimitry Andric } 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 2760b57cec5SDimitry Andric _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 2770b57cec5SDimitry Andric { 2780b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 2790b57cec5SDimitry Andric (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 2800b57cec5SDimitry Andric (__v4si)__S); 2810b57cec5SDimitry Andric } 2820b57cec5SDimitry Andric 2830b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 2840b57cec5SDimitry Andric _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 2850b57cec5SDimitry Andric { 2860b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 2870b57cec5SDimitry Andric (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 2880b57cec5SDimitry Andric (__v4si)_mm_setzero_si128()); 2890b57cec5SDimitry Andric } 2900b57cec5SDimitry Andric 2910b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 2920b57cec5SDimitry Andric _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 2930b57cec5SDimitry Andric { 2940b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 2950b57cec5SDimitry Andric (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 2960b57cec5SDimitry Andric (__v4si)__S); 2970b57cec5SDimitry Andric } 2980b57cec5SDimitry Andric 2990b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 3000b57cec5SDimitry Andric _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 3010b57cec5SDimitry Andric { 3020b57cec5SDimitry Andric return (__m128i)__builtin_ia32_selectd_128(__U, 3030b57cec5SDimitry Andric (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 3040b57cec5SDimitry Andric (__v4si)_mm_setzero_si128()); 3050b57cec5SDimitry Andric } 3060b57cec5SDimitry Andric 3070b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS128 3080b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS256 3090b57cec5SDimitry Andric 3100b57cec5SDimitry Andric #endif 311