xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512vlvnniintrin.h (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
100b57cec5SDimitry Andric #ifndef __IMMINTRIN_H
110b57cec5SDimitry Andric #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
120b57cec5SDimitry Andric #endif
130b57cec5SDimitry Andric 
140b57cec5SDimitry Andric #ifndef __AVX512VLVNNIINTRIN_H
150b57cec5SDimitry Andric #define __AVX512VLVNNIINTRIN_H
160b57cec5SDimitry Andric 
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info for the wrapper itself,
 * and compile the function for the "avx512vl,avx512vnni,no-evex512" target
 * feature set. They differ only in the minimum vector width they declare:
 * 128 bits for the _mm_* functions and 256 bits for the _mm256_* functions.
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
                 __min_vector_width__(256)))
260b57cec5SDimitry Andric 
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S  256-bit vector of eight 32-bit accumulators.
/// \param A  256-bit vector read as unsigned 8-bit integers.
/// \param B  256-bit vector read as signed 8-bit integers.
/// \returns The 256-bit vector DST, as a \c __m256i.
#define _mm256_dpbusd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
460b57cec5SDimitry Andric 
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S  256-bit vector of eight 32-bit accumulators.
/// \param A  256-bit vector read as unsigned 8-bit integers.
/// \param B  256-bit vector read as signed 8-bit integers.
/// \returns The 256-bit vector DST, as a \c __m256i.
#define _mm256_dpbusds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
66e8d8bef9SDimitry Andric 
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S  256-bit vector of eight 32-bit accumulators.
/// \param A  256-bit vector read as signed 16-bit integers.
/// \param B  256-bit vector read as signed 16-bit integers.
/// \returns The 256-bit vector DST, as a \c __m256i.
#define _mm256_dpwssd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
84e8d8bef9SDimitry Andric 
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S  256-bit vector of eight 32-bit accumulators.
/// \param A  256-bit vector read as signed 16-bit integers.
/// \param B  256-bit vector read as signed 16-bit integers.
/// \returns The 256-bit vector DST, as a \c __m256i.
#define _mm256_dpwssds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
102e8d8bef9SDimitry Andric 
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S  128-bit vector of four 32-bit accumulators.
/// \param A  128-bit vector read as unsigned 8-bit integers.
/// \param B  128-bit vector read as signed 8-bit integers.
/// \returns The 128-bit vector DST, as a \c __m128i.
#define _mm_dpbusd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
122e8d8bef9SDimitry Andric 
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S  128-bit vector of four 32-bit accumulators.
/// \param A  128-bit vector read as unsigned 8-bit integers.
/// \param B  128-bit vector read as signed 8-bit integers.
/// \returns The 128-bit vector DST, as a \c __m128i.
#define _mm_dpbusds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
142e8d8bef9SDimitry Andric 
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S  128-bit vector of four 32-bit accumulators.
/// \param A  128-bit vector read as signed 16-bit integers.
/// \param B  128-bit vector read as signed 16-bit integers.
/// \returns The 128-bit vector DST, as a \c __m128i.
#define _mm_dpwssd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
160e8d8bef9SDimitry Andric 
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S  128-bit vector of four 32-bit accumulators.
/// \param A  128-bit vector read as signed 16-bit integers.
/// \param B  128-bit vector read as signed 16-bit integers.
/// \returns The 128-bit vector DST, as a \c __m128i.
#define _mm_dpwssds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
1780b57cec5SDimitry Andric 
1790b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
1800b57cec5SDimitry Andric _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
1810b57cec5SDimitry Andric {
1820b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
1830b57cec5SDimitry Andric                                      (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
1840b57cec5SDimitry Andric                                      (__v8si)__S);
1850b57cec5SDimitry Andric }
1860b57cec5SDimitry Andric 
1870b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
1880b57cec5SDimitry Andric _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
1890b57cec5SDimitry Andric {
1900b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
1910b57cec5SDimitry Andric                                      (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
1920b57cec5SDimitry Andric                                      (__v8si)_mm256_setzero_si256());
1930b57cec5SDimitry Andric }
1940b57cec5SDimitry Andric 
1950b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
1960b57cec5SDimitry Andric _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
1970b57cec5SDimitry Andric {
1980b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
1990b57cec5SDimitry Andric                                     (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
2000b57cec5SDimitry Andric                                     (__v8si)__S);
2010b57cec5SDimitry Andric }
2020b57cec5SDimitry Andric 
2030b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2040b57cec5SDimitry Andric _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
2050b57cec5SDimitry Andric {
2060b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
2070b57cec5SDimitry Andric                                      (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
2080b57cec5SDimitry Andric                                      (__v8si)_mm256_setzero_si256());
2090b57cec5SDimitry Andric }
2100b57cec5SDimitry Andric 
2110b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2120b57cec5SDimitry Andric _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
2130b57cec5SDimitry Andric {
2140b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
2150b57cec5SDimitry Andric                                      (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
2160b57cec5SDimitry Andric                                      (__v8si)__S);
2170b57cec5SDimitry Andric }
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2200b57cec5SDimitry Andric _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
2210b57cec5SDimitry Andric {
2220b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
2230b57cec5SDimitry Andric                                      (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
2240b57cec5SDimitry Andric                                      (__v8si)_mm256_setzero_si256());
2250b57cec5SDimitry Andric }
2260b57cec5SDimitry Andric 
2270b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2280b57cec5SDimitry Andric _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
2290b57cec5SDimitry Andric {
2300b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
2310b57cec5SDimitry Andric                                     (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
2320b57cec5SDimitry Andric                                     (__v8si)__S);
2330b57cec5SDimitry Andric }
2340b57cec5SDimitry Andric 
2350b57cec5SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
2360b57cec5SDimitry Andric _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
2370b57cec5SDimitry Andric {
2380b57cec5SDimitry Andric   return (__m256i)__builtin_ia32_selectd_256(__U,
2390b57cec5SDimitry Andric                                     (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
2400b57cec5SDimitry Andric                                     (__v8si)_mm256_setzero_si256());
2410b57cec5SDimitry Andric }
2420b57cec5SDimitry Andric 
2430b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
2440b57cec5SDimitry Andric _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
2450b57cec5SDimitry Andric {
2460b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
2470b57cec5SDimitry Andric                                         (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
2480b57cec5SDimitry Andric                                         (__v4si)__S);
2490b57cec5SDimitry Andric }
2500b57cec5SDimitry Andric 
2510b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
2520b57cec5SDimitry Andric _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
2530b57cec5SDimitry Andric {
2540b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
2550b57cec5SDimitry Andric                                         (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
2560b57cec5SDimitry Andric                                         (__v4si)_mm_setzero_si128());
2570b57cec5SDimitry Andric }
2580b57cec5SDimitry Andric 
2590b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
2600b57cec5SDimitry Andric _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
2610b57cec5SDimitry Andric {
2620b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
2630b57cec5SDimitry Andric                                        (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
2640b57cec5SDimitry Andric                                        (__v4si)__S);
2650b57cec5SDimitry Andric }
2660b57cec5SDimitry Andric 
2670b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
2680b57cec5SDimitry Andric _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
2690b57cec5SDimitry Andric {
2700b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
2710b57cec5SDimitry Andric                                        (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
2720b57cec5SDimitry Andric                                        (__v4si)_mm_setzero_si128());
2730b57cec5SDimitry Andric }
2740b57cec5SDimitry Andric 
2750b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
2760b57cec5SDimitry Andric _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
2770b57cec5SDimitry Andric {
2780b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
2790b57cec5SDimitry Andric                                         (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
2800b57cec5SDimitry Andric                                         (__v4si)__S);
2810b57cec5SDimitry Andric }
2820b57cec5SDimitry Andric 
2830b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
2840b57cec5SDimitry Andric _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
2850b57cec5SDimitry Andric {
2860b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
2870b57cec5SDimitry Andric                                         (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
2880b57cec5SDimitry Andric                                         (__v4si)_mm_setzero_si128());
2890b57cec5SDimitry Andric }
2900b57cec5SDimitry Andric 
2910b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
2920b57cec5SDimitry Andric _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
2930b57cec5SDimitry Andric {
2940b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
2950b57cec5SDimitry Andric                                        (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
2960b57cec5SDimitry Andric                                        (__v4si)__S);
2970b57cec5SDimitry Andric }
2980b57cec5SDimitry Andric 
2990b57cec5SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128
3000b57cec5SDimitry Andric _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
3010b57cec5SDimitry Andric {
3020b57cec5SDimitry Andric   return (__m128i)__builtin_ia32_selectd_128(__U,
3030b57cec5SDimitry Andric                                        (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
3040b57cec5SDimitry Andric                                        (__v4si)_mm_setzero_si128());
3050b57cec5SDimitry Andric }
3060b57cec5SDimitry Andric 
3070b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS128
3080b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS256
3090b57cec5SDimitry Andric 
3100b57cec5SDimitry Andric #endif
311