/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10*06c3fb27SDimitry Andric #ifndef __IMMINTRIN_H
11*06c3fb27SDimitry Andric #error \
12*06c3fb27SDimitry Andric "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13*06c3fb27SDimitry Andric #endif // __IMMINTRIN_H
14*06c3fb27SDimitry Andric
15*06c3fb27SDimitry Andric #ifndef __AVXVNNIINT16INTRIN_H
16*06c3fb27SDimitry Andric #define __AVXVNNIINT16INTRIN_H
17*06c3fb27SDimitry Andric
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info and enable the
 * "avxvnniint16" target feature for the annotated function; they differ only
 * in the minimum vector width they request (128 vs. 256 bits). The macros are
 * private to this header and are #undef'd again at its end. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
                 __min_vector_width__(256)))

26*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
29*06c3fb27SDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30*06c3fb27SDimitry Andric ///
31*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
32*06c3fb27SDimitry Andric ///
33*06c3fb27SDimitry Andric /// \code
34*06c3fb27SDimitry Andric /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35*06c3fb27SDimitry Andric /// \endcode
36*06c3fb27SDimitry Andric ///
37*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUD instruction.
38*06c3fb27SDimitry Andric ///
39*06c3fb27SDimitry Andric /// \param __W
40*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
41*06c3fb27SDimitry Andric /// \param __A
42*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x short].
43*06c3fb27SDimitry Andric /// \param __B
44*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
45*06c3fb27SDimitry Andric /// \returns
46*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
47*06c3fb27SDimitry Andric ///
48*06c3fb27SDimitry Andric /// \code{.operation}
49*06c3fb27SDimitry Andric /// FOR j := 0 to 3
50*06c3fb27SDimitry Andric /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51*06c3fb27SDimitry Andric /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52*06c3fb27SDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53*06c3fb27SDimitry Andric /// ENDFOR
54*06c3fb27SDimitry Andric /// dst[MAX:128] := 0
55*06c3fb27SDimitry Andric /// \endcode
_mm_dpwsud_epi32(__m128i __W,__m128i __A,__m128i __B)56*06c3fb27SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57*06c3fb27SDimitry Andric __m128i __A,
58*06c3fb27SDimitry Andric __m128i __B) {
59*06c3fb27SDimitry Andric return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60*06c3fb27SDimitry Andric (__v4si)__B);
61*06c3fb27SDimitry Andric }
62*06c3fb27SDimitry Andric
63*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
66*06c3fb27SDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67*06c3fb27SDimitry Andric ///
68*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
69*06c3fb27SDimitry Andric ///
70*06c3fb27SDimitry Andric /// \code
71*06c3fb27SDimitry Andric /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72*06c3fb27SDimitry Andric /// \endcode
73*06c3fb27SDimitry Andric ///
74*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUD instruction.
75*06c3fb27SDimitry Andric ///
76*06c3fb27SDimitry Andric /// \param __W
77*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
78*06c3fb27SDimitry Andric /// \param __A
79*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x short].
80*06c3fb27SDimitry Andric /// \param __B
81*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
82*06c3fb27SDimitry Andric /// \returns
83*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
84*06c3fb27SDimitry Andric ///
85*06c3fb27SDimitry Andric /// \code{.operation}
86*06c3fb27SDimitry Andric /// FOR j := 0 to 7
87*06c3fb27SDimitry Andric /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88*06c3fb27SDimitry Andric /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89*06c3fb27SDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90*06c3fb27SDimitry Andric /// ENDFOR
91*06c3fb27SDimitry Andric /// dst[MAX:256] := 0
92*06c3fb27SDimitry Andric /// \endcode
93*06c3fb27SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W,__m256i __A,__m256i __B)94*06c3fb27SDimitry Andric _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95*06c3fb27SDimitry Andric return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96*06c3fb27SDimitry Andric (__v8si)__B);
97*06c3fb27SDimitry Andric }
98*06c3fb27SDimitry Andric
99*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
102*06c3fb27SDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
103*06c3fb27SDimitry Andric /// 32-bit results in \a dst.
104*06c3fb27SDimitry Andric ///
105*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
106*06c3fb27SDimitry Andric ///
107*06c3fb27SDimitry Andric /// \code
108*06c3fb27SDimitry Andric /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109*06c3fb27SDimitry Andric /// \endcode
110*06c3fb27SDimitry Andric ///
111*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112*06c3fb27SDimitry Andric ///
113*06c3fb27SDimitry Andric /// \param __W
114*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
115*06c3fb27SDimitry Andric /// \param __A
116*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x short].
117*06c3fb27SDimitry Andric /// \param __B
118*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
119*06c3fb27SDimitry Andric /// \returns
120*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
121*06c3fb27SDimitry Andric ///
122*06c3fb27SDimitry Andric /// \code{.operation}
123*06c3fb27SDimitry Andric /// FOR j := 0 to 3
124*06c3fb27SDimitry Andric /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125*06c3fb27SDimitry Andric /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126*06c3fb27SDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127*06c3fb27SDimitry Andric /// ENDFOR
128*06c3fb27SDimitry Andric /// dst[MAX:128] := 0
129*06c3fb27SDimitry Andric /// \endcode
_mm_dpwsuds_epi32(__m128i __W,__m128i __A,__m128i __B)130*06c3fb27SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131*06c3fb27SDimitry Andric __m128i __A,
132*06c3fb27SDimitry Andric __m128i __B) {
133*06c3fb27SDimitry Andric return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134*06c3fb27SDimitry Andric (__v4si)__B);
135*06c3fb27SDimitry Andric }
136*06c3fb27SDimitry Andric
137*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
140*06c3fb27SDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
141*06c3fb27SDimitry Andric /// 32-bit results in \a dst.
142*06c3fb27SDimitry Andric ///
143*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
144*06c3fb27SDimitry Andric ///
145*06c3fb27SDimitry Andric /// \code
146*06c3fb27SDimitry Andric /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147*06c3fb27SDimitry Andric /// \endcode
148*06c3fb27SDimitry Andric ///
149*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150*06c3fb27SDimitry Andric ///
151*06c3fb27SDimitry Andric /// \param __W
152*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
153*06c3fb27SDimitry Andric /// \param __A
154*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x short].
155*06c3fb27SDimitry Andric /// \param __B
156*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
157*06c3fb27SDimitry Andric /// \returns
158*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
159*06c3fb27SDimitry Andric ///
160*06c3fb27SDimitry Andric /// \code{.operation}
161*06c3fb27SDimitry Andric /// FOR j := 0 to 7
162*06c3fb27SDimitry Andric /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163*06c3fb27SDimitry Andric /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164*06c3fb27SDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165*06c3fb27SDimitry Andric /// ENDFOR
166*06c3fb27SDimitry Andric /// dst[MAX:256] := 0
167*06c3fb27SDimitry Andric /// \endcode
168*06c3fb27SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W,__m256i __A,__m256i __B)169*06c3fb27SDimitry Andric _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170*06c3fb27SDimitry Andric return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171*06c3fb27SDimitry Andric (__v8si)__B);
172*06c3fb27SDimitry Andric }
173*06c3fb27SDimitry Andric
174*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175*06c3fb27SDimitry Andric /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
177*06c3fb27SDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178*06c3fb27SDimitry Andric ///
179*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
180*06c3fb27SDimitry Andric ///
181*06c3fb27SDimitry Andric /// \code
182*06c3fb27SDimitry Andric /// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183*06c3fb27SDimitry Andric /// \endcode
184*06c3fb27SDimitry Andric ///
185*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWUSD instruction.
186*06c3fb27SDimitry Andric ///
187*06c3fb27SDimitry Andric /// \param __W
188*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
189*06c3fb27SDimitry Andric /// \param __A
190*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
191*06c3fb27SDimitry Andric /// \param __B
192*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x short].
193*06c3fb27SDimitry Andric /// \returns
194*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
195*06c3fb27SDimitry Andric ///
196*06c3fb27SDimitry Andric /// \code{.operation}
197*06c3fb27SDimitry Andric /// FOR j := 0 to 3
198*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200*06c3fb27SDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201*06c3fb27SDimitry Andric /// ENDFOR
202*06c3fb27SDimitry Andric /// dst[MAX:128] := 0
203*06c3fb27SDimitry Andric /// \endcode
_mm_dpwusd_epi32(__m128i __W,__m128i __A,__m128i __B)204*06c3fb27SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205*06c3fb27SDimitry Andric __m128i __A,
206*06c3fb27SDimitry Andric __m128i __B) {
207*06c3fb27SDimitry Andric return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208*06c3fb27SDimitry Andric (__v4si)__B);
209*06c3fb27SDimitry Andric }
210*06c3fb27SDimitry Andric
211*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212*06c3fb27SDimitry Andric /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
214*06c3fb27SDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215*06c3fb27SDimitry Andric ///
216*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
217*06c3fb27SDimitry Andric ///
218*06c3fb27SDimitry Andric /// \code
219*06c3fb27SDimitry Andric /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220*06c3fb27SDimitry Andric /// \endcode
221*06c3fb27SDimitry Andric ///
222*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWUSD instruction.
223*06c3fb27SDimitry Andric ///
224*06c3fb27SDimitry Andric /// \param __W
225*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
226*06c3fb27SDimitry Andric /// \param __A
227*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
228*06c3fb27SDimitry Andric /// \param __B
229*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x short].
230*06c3fb27SDimitry Andric /// \returns
231*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
232*06c3fb27SDimitry Andric ///
233*06c3fb27SDimitry Andric /// \code{.operation}
234*06c3fb27SDimitry Andric /// FOR j := 0 to 7
235*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237*06c3fb27SDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238*06c3fb27SDimitry Andric /// ENDFOR
239*06c3fb27SDimitry Andric /// dst[MAX:256] := 0
240*06c3fb27SDimitry Andric /// \endcode
241*06c3fb27SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W,__m256i __A,__m256i __B)242*06c3fb27SDimitry Andric _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243*06c3fb27SDimitry Andric return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244*06c3fb27SDimitry Andric (__v8si)__B);
245*06c3fb27SDimitry Andric }
246*06c3fb27SDimitry Andric
247*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248*06c3fb27SDimitry Andric /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
250*06c3fb27SDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
251*06c3fb27SDimitry Andric /// 32-bit results in \a dst.
252*06c3fb27SDimitry Andric ///
253*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
254*06c3fb27SDimitry Andric ///
255*06c3fb27SDimitry Andric /// \code
256*06c3fb27SDimitry Andric /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257*06c3fb27SDimitry Andric /// \endcode
258*06c3fb27SDimitry Andric ///
259*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
260*06c3fb27SDimitry Andric ///
261*06c3fb27SDimitry Andric /// \param __W
262*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
263*06c3fb27SDimitry Andric /// \param __A
264*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
265*06c3fb27SDimitry Andric /// \param __B
266*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x short].
267*06c3fb27SDimitry Andric /// \returns
268*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x int].
269*06c3fb27SDimitry Andric ///
270*06c3fb27SDimitry Andric /// \code{.operation}
271*06c3fb27SDimitry Andric /// FOR j := 0 to 3
272*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274*06c3fb27SDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275*06c3fb27SDimitry Andric /// ENDFOR
276*06c3fb27SDimitry Andric /// dst[MAX:128] := 0
277*06c3fb27SDimitry Andric /// \endcode
_mm_dpwusds_epi32(__m128i __W,__m128i __A,__m128i __B)278*06c3fb27SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279*06c3fb27SDimitry Andric __m128i __A,
280*06c3fb27SDimitry Andric __m128i __B) {
281*06c3fb27SDimitry Andric return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282*06c3fb27SDimitry Andric (__v4si)__B);
283*06c3fb27SDimitry Andric }
284*06c3fb27SDimitry Andric
285*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286*06c3fb27SDimitry Andric /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
288*06c3fb27SDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
289*06c3fb27SDimitry Andric /// 32-bit results in \a dst.
290*06c3fb27SDimitry Andric ///
291*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
292*06c3fb27SDimitry Andric ///
293*06c3fb27SDimitry Andric /// \code
294*06c3fb27SDimitry Andric /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
295*06c3fb27SDimitry Andric /// \endcode
296*06c3fb27SDimitry Andric ///
297*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
298*06c3fb27SDimitry Andric ///
299*06c3fb27SDimitry Andric /// \param __W
300*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
301*06c3fb27SDimitry Andric /// \param __A
302*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
303*06c3fb27SDimitry Andric /// \param __B
304*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x short].
305*06c3fb27SDimitry Andric /// \returns
306*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x int].
307*06c3fb27SDimitry Andric ///
308*06c3fb27SDimitry Andric /// \code{.operation}
309*06c3fb27SDimitry Andric /// FOR j := 0 to 7
310*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312*06c3fb27SDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313*06c3fb27SDimitry Andric /// ENDFOR
314*06c3fb27SDimitry Andric /// dst[MAX:256] := 0
315*06c3fb27SDimitry Andric /// \endcode
316*06c3fb27SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W,__m256i __A,__m256i __B)317*06c3fb27SDimitry Andric _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318*06c3fb27SDimitry Andric return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319*06c3fb27SDimitry Andric (__v8si)__B);
320*06c3fb27SDimitry Andric }
321*06c3fb27SDimitry Andric
322*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
325*06c3fb27SDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326*06c3fb27SDimitry Andric ///
327*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
328*06c3fb27SDimitry Andric ///
329*06c3fb27SDimitry Andric /// \code
330*06c3fb27SDimitry Andric /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331*06c3fb27SDimitry Andric /// \endcode
332*06c3fb27SDimitry Andric ///
333*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWUUD instruction.
334*06c3fb27SDimitry Andric ///
335*06c3fb27SDimitry Andric /// \param __W
336*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x unsigned int].
337*06c3fb27SDimitry Andric /// \param __A
338*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
339*06c3fb27SDimitry Andric /// \param __B
340*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
341*06c3fb27SDimitry Andric /// \returns
342*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x unsigned int].
343*06c3fb27SDimitry Andric ///
344*06c3fb27SDimitry Andric /// \code{.operation}
345*06c3fb27SDimitry Andric /// FOR j := 0 to 3
346*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348*06c3fb27SDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349*06c3fb27SDimitry Andric /// ENDFOR
350*06c3fb27SDimitry Andric /// dst[MAX:128] := 0
351*06c3fb27SDimitry Andric /// \endcode
_mm_dpwuud_epi32(__m128i __W,__m128i __A,__m128i __B)352*06c3fb27SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353*06c3fb27SDimitry Andric __m128i __A,
354*06c3fb27SDimitry Andric __m128i __B) {
355*06c3fb27SDimitry Andric return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356*06c3fb27SDimitry Andric (__v4si)__B);
357*06c3fb27SDimitry Andric }
358*06c3fb27SDimitry Andric
359*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
362*06c3fb27SDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363*06c3fb27SDimitry Andric ///
364*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
365*06c3fb27SDimitry Andric ///
366*06c3fb27SDimitry Andric /// \code
367*06c3fb27SDimitry Andric /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368*06c3fb27SDimitry Andric /// \endcode
369*06c3fb27SDimitry Andric ///
370*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWUUD instruction.
371*06c3fb27SDimitry Andric ///
372*06c3fb27SDimitry Andric /// \param __W
373*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x unsigned int].
374*06c3fb27SDimitry Andric /// \param __A
375*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
376*06c3fb27SDimitry Andric /// \param __B
377*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
378*06c3fb27SDimitry Andric /// \returns
379*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x unsigned int].
380*06c3fb27SDimitry Andric ///
381*06c3fb27SDimitry Andric /// \code{.operation}
382*06c3fb27SDimitry Andric /// FOR j := 0 to 7
383*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385*06c3fb27SDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386*06c3fb27SDimitry Andric /// ENDFOR
387*06c3fb27SDimitry Andric /// dst[MAX:256] := 0
388*06c3fb27SDimitry Andric /// \endcode
389*06c3fb27SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W,__m256i __A,__m256i __B)390*06c3fb27SDimitry Andric _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391*06c3fb27SDimitry Andric return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392*06c3fb27SDimitry Andric (__v8si)__B);
393*06c3fb27SDimitry Andric }
394*06c3fb27SDimitry Andric
395*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
396*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
397*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
398*06c3fb27SDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
399*06c3fb27SDimitry Andric /// 32-bit results in \a dst.
400*06c3fb27SDimitry Andric ///
401*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
402*06c3fb27SDimitry Andric ///
403*06c3fb27SDimitry Andric /// \code
404*06c3fb27SDimitry Andric /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
405*06c3fb27SDimitry Andric /// \endcode
406*06c3fb27SDimitry Andric ///
407*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
408*06c3fb27SDimitry Andric ///
409*06c3fb27SDimitry Andric /// \param __W
410*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x unsigned int].
411*06c3fb27SDimitry Andric /// \param __A
412*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
413*06c3fb27SDimitry Andric /// \param __B
414*06c3fb27SDimitry Andric /// A 128-bit vector of [8 x unsigned short].
415*06c3fb27SDimitry Andric /// \returns
416*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x unsigned int].
417*06c3fb27SDimitry Andric ///
418*06c3fb27SDimitry Andric /// \code{.operation}
419*06c3fb27SDimitry Andric /// FOR j := 0 to 3
420*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
421*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
422*06c3fb27SDimitry Andric /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
423*06c3fb27SDimitry Andric /// ENDFOR
424*06c3fb27SDimitry Andric /// dst[MAX:128] := 0
425*06c3fb27SDimitry Andric /// \endcode
_mm_dpwuuds_epi32(__m128i __W,__m128i __A,__m128i __B)426*06c3fb27SDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427*06c3fb27SDimitry Andric __m128i __A,
428*06c3fb27SDimitry Andric __m128i __B) {
429*06c3fb27SDimitry Andric return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430*06c3fb27SDimitry Andric (__v4si)__B);
431*06c3fb27SDimitry Andric }
432*06c3fb27SDimitry Andric
433*06c3fb27SDimitry Andric /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434*06c3fb27SDimitry Andric /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435*06c3fb27SDimitry Andric /// signed 16-bit results. Sum these 2 results with the corresponding
436*06c3fb27SDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
437*06c3fb27SDimitry Andric /// 32-bit results in \a dst.
438*06c3fb27SDimitry Andric ///
439*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
440*06c3fb27SDimitry Andric ///
441*06c3fb27SDimitry Andric /// \code
442*06c3fb27SDimitry Andric /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443*06c3fb27SDimitry Andric /// \endcode
444*06c3fb27SDimitry Andric ///
445*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
446*06c3fb27SDimitry Andric ///
447*06c3fb27SDimitry Andric /// \param __W
448*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x unsigned int].
449*06c3fb27SDimitry Andric /// \param __A
450*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
451*06c3fb27SDimitry Andric /// \param __B
452*06c3fb27SDimitry Andric /// A 256-bit vector of [16 x unsigned short].
453*06c3fb27SDimitry Andric /// \returns
454*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x unsigned int].
455*06c3fb27SDimitry Andric ///
456*06c3fb27SDimitry Andric /// \code{.operation}
457*06c3fb27SDimitry Andric /// FOR j := 0 to 7
458*06c3fb27SDimitry Andric /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459*06c3fb27SDimitry Andric /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460*06c3fb27SDimitry Andric /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461*06c3fb27SDimitry Andric /// ENDFOR
462*06c3fb27SDimitry Andric /// dst[MAX:256] := 0
463*06c3fb27SDimitry Andric /// \endcode
464*06c3fb27SDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W,__m256i __A,__m256i __B)465*06c3fb27SDimitry Andric _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466*06c3fb27SDimitry Andric return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467*06c3fb27SDimitry Andric (__v8si)__B);
468*06c3fb27SDimitry Andric }
469*06c3fb27SDimitry Andric
470*06c3fb27SDimitry Andric #undef __DEFAULT_FN_ATTRS128
471*06c3fb27SDimitry Andric #undef __DEFAULT_FN_ATTRS256
472*06c3fb27SDimitry Andric
473*06c3fb27SDimitry Andric #endif // __AVXVNNIINT16INTRIN_H
474