/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9*bdd1243dSDimitry Andric #ifndef __IMMINTRIN_H
10*bdd1243dSDimitry Andric #error \
11*bdd1243dSDimitry Andric "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12*bdd1243dSDimitry Andric #endif
13*bdd1243dSDimitry Andric
14*bdd1243dSDimitry Andric #ifndef __AVXVNNIINT8INTRIN_H
15*bdd1243dSDimitry Andric #define __AVXVNNIINT8INTRIN_H
16*bdd1243dSDimitry Andric
/* Attribute sets applied to every intrinsic in this file: always inline, no
 * debug info, require the "avxvnniint8" target feature, and record the
 * minimum vector width (256 or 128 bits) used by the function. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint8"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint8"), __min_vector_width__(128)))

25*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26*bdd1243dSDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
28*bdd1243dSDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29*bdd1243dSDimitry Andric ///
30*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
31*bdd1243dSDimitry Andric ///
32*bdd1243dSDimitry Andric /// \code
33*bdd1243dSDimitry Andric /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34*bdd1243dSDimitry Andric /// \endcode
35*bdd1243dSDimitry Andric ///
36*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
37*bdd1243dSDimitry Andric ///
38*bdd1243dSDimitry Andric /// \param __A
39*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x char].
40*bdd1243dSDimitry Andric /// \param __B
41*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x char].
42*bdd1243dSDimitry Andric /// \returns
43*bdd1243dSDimitry Andric /// A 128-bit vector of [4 x int].
44*bdd1243dSDimitry Andric ///
45*bdd1243dSDimitry Andric /// \code{.operation}
46*bdd1243dSDimitry Andric /// FOR j := 0 to 3
47*bdd1243dSDimitry Andric /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48*bdd1243dSDimitry Andric /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49*bdd1243dSDimitry Andric /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50*bdd1243dSDimitry Andric /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51*bdd1243dSDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52*bdd1243dSDimitry Andric /// ENDFOR
53*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
54*bdd1243dSDimitry Andric /// \endcode
_mm_dpbssd_epi32(__m128i __W,__m128i __A,__m128i __B)55*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56*bdd1243dSDimitry Andric __m128i __A,
57*bdd1243dSDimitry Andric __m128i __B) {
58*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59*bdd1243dSDimitry Andric (__v4si)__B);
60*bdd1243dSDimitry Andric }
61*bdd1243dSDimitry Andric
62*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63*bdd1243dSDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
65*bdd1243dSDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66*bdd1243dSDimitry Andric ///
67*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
68*bdd1243dSDimitry Andric ///
69*bdd1243dSDimitry Andric /// \code
70*bdd1243dSDimitry Andric /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71*bdd1243dSDimitry Andric /// \endcode
72*bdd1243dSDimitry Andric ///
73*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
74*bdd1243dSDimitry Andric ///
75*bdd1243dSDimitry Andric /// \param __A
76*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x char].
77*bdd1243dSDimitry Andric /// \param __B
78*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x char].
79*bdd1243dSDimitry Andric /// \returns
80*bdd1243dSDimitry Andric /// A 256-bit vector of [8 x int].
81*bdd1243dSDimitry Andric ///
82*bdd1243dSDimitry Andric /// \code{.operation}
83*bdd1243dSDimitry Andric /// FOR j := 0 to 7
84*bdd1243dSDimitry Andric /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85*bdd1243dSDimitry Andric /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86*bdd1243dSDimitry Andric /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87*bdd1243dSDimitry Andric /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88*bdd1243dSDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89*bdd1243dSDimitry Andric /// ENDFOR
90*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
91*bdd1243dSDimitry Andric /// \endcode
92*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W,__m256i __A,__m256i __B)93*bdd1243dSDimitry Andric _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95*bdd1243dSDimitry Andric (__v8si)__B);
96*bdd1243dSDimitry Andric }
97*bdd1243dSDimitry Andric
98*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99*bdd1243dSDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
101*bdd1243dSDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
102*bdd1243dSDimitry Andric /// 32-bit results in \a dst.
103*bdd1243dSDimitry Andric ///
104*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
105*bdd1243dSDimitry Andric ///
106*bdd1243dSDimitry Andric /// \code
107*bdd1243dSDimitry Andric /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
108*bdd1243dSDimitry Andric /// \endcode
109*bdd1243dSDimitry Andric ///
110*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
111*bdd1243dSDimitry Andric ///
112*bdd1243dSDimitry Andric /// \param __A
113*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x char].
114*bdd1243dSDimitry Andric /// \param __B
115*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x char].
116*bdd1243dSDimitry Andric /// \returns
117*bdd1243dSDimitry Andric /// A 128-bit vector of [4 x int].
118*bdd1243dSDimitry Andric ///
119*bdd1243dSDimitry Andric /// \code{.operation}
120*bdd1243dSDimitry Andric /// FOR j := 0 to 3
121*bdd1243dSDimitry Andric /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122*bdd1243dSDimitry Andric /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123*bdd1243dSDimitry Andric /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124*bdd1243dSDimitry Andric /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125*bdd1243dSDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126*bdd1243dSDimitry Andric /// ENDFOR
127*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
128*bdd1243dSDimitry Andric /// \endcode
_mm_dpbssds_epi32(__m128i __W,__m128i __A,__m128i __B)129*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130*bdd1243dSDimitry Andric __m128i __A,
131*bdd1243dSDimitry Andric __m128i __B) {
132*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133*bdd1243dSDimitry Andric (__v4si)__B);
134*bdd1243dSDimitry Andric }
135*bdd1243dSDimitry Andric
136*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137*bdd1243dSDimitry Andric /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
139*bdd1243dSDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
140*bdd1243dSDimitry Andric /// 32-bit results in \a dst.
141*bdd1243dSDimitry Andric ///
142*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
143*bdd1243dSDimitry Andric ///
144*bdd1243dSDimitry Andric /// \code
145*bdd1243dSDimitry Andric /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146*bdd1243dSDimitry Andric /// \endcode
147*bdd1243dSDimitry Andric ///
148*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
149*bdd1243dSDimitry Andric ///
150*bdd1243dSDimitry Andric /// \param __A
151*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x char].
152*bdd1243dSDimitry Andric /// \param __B
153*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x char].
154*bdd1243dSDimitry Andric /// \returns
155*bdd1243dSDimitry Andric /// A 256-bit vector of [8 x int].
156*bdd1243dSDimitry Andric ///
157*bdd1243dSDimitry Andric /// \code{.operation}
158*bdd1243dSDimitry Andric /// FOR j := 0 to 7
159*bdd1243dSDimitry Andric /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160*bdd1243dSDimitry Andric /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161*bdd1243dSDimitry Andric /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162*bdd1243dSDimitry Andric /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163*bdd1243dSDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164*bdd1243dSDimitry Andric /// ENDFOR
165*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
166*bdd1243dSDimitry Andric /// \endcode
167*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W,__m256i __A,__m256i __B)168*bdd1243dSDimitry Andric _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170*bdd1243dSDimitry Andric (__v8si)__B);
171*bdd1243dSDimitry Andric }
172*bdd1243dSDimitry Andric
173*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
176*bdd1243dSDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177*bdd1243dSDimitry Andric ///
178*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
179*bdd1243dSDimitry Andric ///
180*bdd1243dSDimitry Andric /// \code
181*bdd1243dSDimitry Andric /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182*bdd1243dSDimitry Andric /// \endcode
183*bdd1243dSDimitry Andric ///
184*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
185*bdd1243dSDimitry Andric ///
186*bdd1243dSDimitry Andric /// \param __A
187*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x char].
188*bdd1243dSDimitry Andric /// \param __B
189*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x unsigned char].
190*bdd1243dSDimitry Andric /// \returns
191*bdd1243dSDimitry Andric /// A 128-bit vector of [4 x int].
192*bdd1243dSDimitry Andric ///
193*bdd1243dSDimitry Andric /// \code{.operation}
194*bdd1243dSDimitry Andric /// FOR j := 0 to 3
195*bdd1243dSDimitry Andric /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196*bdd1243dSDimitry Andric /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197*bdd1243dSDimitry Andric /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198*bdd1243dSDimitry Andric /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199*bdd1243dSDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200*bdd1243dSDimitry Andric /// ENDFOR
201*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
202*bdd1243dSDimitry Andric /// \endcode
_mm_dpbsud_epi32(__m128i __W,__m128i __A,__m128i __B)203*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204*bdd1243dSDimitry Andric __m128i __A,
205*bdd1243dSDimitry Andric __m128i __B) {
206*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207*bdd1243dSDimitry Andric (__v4si)__B);
208*bdd1243dSDimitry Andric }
209*bdd1243dSDimitry Andric
210*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
213*bdd1243dSDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214*bdd1243dSDimitry Andric ///
215*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
216*bdd1243dSDimitry Andric ///
217*bdd1243dSDimitry Andric /// \code
218*bdd1243dSDimitry Andric /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219*bdd1243dSDimitry Andric /// \endcode
220*bdd1243dSDimitry Andric ///
221*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
222*bdd1243dSDimitry Andric ///
223*bdd1243dSDimitry Andric /// \param __A
224*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x char].
225*bdd1243dSDimitry Andric /// \param __B
226*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x unsigned char].
227*bdd1243dSDimitry Andric /// \returns
228*bdd1243dSDimitry Andric /// A 256-bit vector of [8 x int].
229*bdd1243dSDimitry Andric ///
230*bdd1243dSDimitry Andric /// \code{.operation}
231*bdd1243dSDimitry Andric /// FOR j := 0 to 7
232*bdd1243dSDimitry Andric /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233*bdd1243dSDimitry Andric /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234*bdd1243dSDimitry Andric /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235*bdd1243dSDimitry Andric /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236*bdd1243dSDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237*bdd1243dSDimitry Andric /// ENDFOR
238*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
239*bdd1243dSDimitry Andric /// \endcode
240*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W,__m256i __A,__m256i __B)241*bdd1243dSDimitry Andric _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243*bdd1243dSDimitry Andric (__v8si)__B);
244*bdd1243dSDimitry Andric }
245*bdd1243dSDimitry Andric
246*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
249*bdd1243dSDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
250*bdd1243dSDimitry Andric /// 32-bit results in \a dst.
251*bdd1243dSDimitry Andric ///
252*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
253*bdd1243dSDimitry Andric ///
254*bdd1243dSDimitry Andric /// \code
255*bdd1243dSDimitry Andric /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
256*bdd1243dSDimitry Andric /// \endcode
257*bdd1243dSDimitry Andric ///
258*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
259*bdd1243dSDimitry Andric ///
260*bdd1243dSDimitry Andric /// \param __A
261*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x char].
262*bdd1243dSDimitry Andric /// \param __B
263*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x unsigned char].
264*bdd1243dSDimitry Andric /// \returns
265*bdd1243dSDimitry Andric /// A 128-bit vector of [4 x int].
266*bdd1243dSDimitry Andric ///
267*bdd1243dSDimitry Andric /// \code{.operation}
268*bdd1243dSDimitry Andric /// FOR j := 0 to 3
269*bdd1243dSDimitry Andric /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270*bdd1243dSDimitry Andric /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271*bdd1243dSDimitry Andric /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272*bdd1243dSDimitry Andric /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273*bdd1243dSDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274*bdd1243dSDimitry Andric /// ENDFOR
275*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
276*bdd1243dSDimitry Andric /// \endcode
_mm_dpbsuds_epi32(__m128i __W,__m128i __A,__m128i __B)277*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278*bdd1243dSDimitry Andric __m128i __A,
279*bdd1243dSDimitry Andric __m128i __B) {
280*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281*bdd1243dSDimitry Andric (__v4si)__B);
282*bdd1243dSDimitry Andric }
283*bdd1243dSDimitry Andric
284*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
287*bdd1243dSDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
288*bdd1243dSDimitry Andric /// 32-bit results in \a dst.
289*bdd1243dSDimitry Andric ///
290*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
291*bdd1243dSDimitry Andric ///
292*bdd1243dSDimitry Andric /// \code
293*bdd1243dSDimitry Andric /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294*bdd1243dSDimitry Andric /// \endcode
295*bdd1243dSDimitry Andric ///
296*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
297*bdd1243dSDimitry Andric ///
298*bdd1243dSDimitry Andric /// \param __A
299*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x char].
300*bdd1243dSDimitry Andric /// \param __B
301*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x unsigned char].
302*bdd1243dSDimitry Andric /// \returns
303*bdd1243dSDimitry Andric /// A 256-bit vector of [8 x int].
304*bdd1243dSDimitry Andric ///
305*bdd1243dSDimitry Andric /// \code{.operation}
306*bdd1243dSDimitry Andric /// FOR j := 0 to 7
307*bdd1243dSDimitry Andric /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308*bdd1243dSDimitry Andric /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309*bdd1243dSDimitry Andric /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310*bdd1243dSDimitry Andric /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311*bdd1243dSDimitry Andric /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312*bdd1243dSDimitry Andric /// ENDFOR
313*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
314*bdd1243dSDimitry Andric /// \endcode
315*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W,__m256i __A,__m256i __B)316*bdd1243dSDimitry Andric _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318*bdd1243dSDimitry Andric (__v8si)__B);
319*bdd1243dSDimitry Andric }
320*bdd1243dSDimitry Andric
321*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
324*bdd1243dSDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325*bdd1243dSDimitry Andric ///
326*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
327*bdd1243dSDimitry Andric ///
328*bdd1243dSDimitry Andric /// \code
329*bdd1243dSDimitry Andric /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330*bdd1243dSDimitry Andric /// \endcode
331*bdd1243dSDimitry Andric ///
332*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
333*bdd1243dSDimitry Andric ///
334*bdd1243dSDimitry Andric /// \param __A
335*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x unsigned char].
336*bdd1243dSDimitry Andric /// \param __B
337*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x unsigned char].
338*bdd1243dSDimitry Andric /// \returns
339*bdd1243dSDimitry Andric /// A 128-bit vector of [4 x int].
340*bdd1243dSDimitry Andric ///
341*bdd1243dSDimitry Andric /// \code{.operation}
342*bdd1243dSDimitry Andric /// FOR j := 0 to 3
343*bdd1243dSDimitry Andric /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344*bdd1243dSDimitry Andric /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345*bdd1243dSDimitry Andric /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346*bdd1243dSDimitry Andric /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347*bdd1243dSDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348*bdd1243dSDimitry Andric /// ENDFOR
349*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
350*bdd1243dSDimitry Andric /// \endcode
_mm_dpbuud_epi32(__m128i __W,__m128i __A,__m128i __B)351*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352*bdd1243dSDimitry Andric __m128i __A,
353*bdd1243dSDimitry Andric __m128i __B) {
354*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355*bdd1243dSDimitry Andric (__v4si)__B);
356*bdd1243dSDimitry Andric }
357*bdd1243dSDimitry Andric
358*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
361*bdd1243dSDimitry Andric /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362*bdd1243dSDimitry Andric ///
363*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
364*bdd1243dSDimitry Andric ///
365*bdd1243dSDimitry Andric /// \code
366*bdd1243dSDimitry Andric /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367*bdd1243dSDimitry Andric /// \endcode
368*bdd1243dSDimitry Andric ///
369*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBSSD instruction.
370*bdd1243dSDimitry Andric ///
371*bdd1243dSDimitry Andric /// \param __A
372*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x unsigned char].
373*bdd1243dSDimitry Andric /// \param __B
374*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x unsigned char].
375*bdd1243dSDimitry Andric /// \returns
376*bdd1243dSDimitry Andric /// A 256-bit vector of [8 x int].
377*bdd1243dSDimitry Andric ///
378*bdd1243dSDimitry Andric /// \code{.operation}
379*bdd1243dSDimitry Andric /// FOR j := 0 to 7
380*bdd1243dSDimitry Andric /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381*bdd1243dSDimitry Andric /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382*bdd1243dSDimitry Andric /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383*bdd1243dSDimitry Andric /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384*bdd1243dSDimitry Andric /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385*bdd1243dSDimitry Andric /// ENDFOR
386*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
387*bdd1243dSDimitry Andric /// \endcode
388*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuud_epi32(__m256i __W,__m256i __A,__m256i __B)389*bdd1243dSDimitry Andric _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391*bdd1243dSDimitry Andric (__v8si)__B);
392*bdd1243dSDimitry Andric }
393*bdd1243dSDimitry Andric
394*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
397*bdd1243dSDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
398*bdd1243dSDimitry Andric /// 32-bit results in \a dst.
399*bdd1243dSDimitry Andric ///
400*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
401*bdd1243dSDimitry Andric ///
402*bdd1243dSDimitry Andric /// \code
403*bdd1243dSDimitry Andric /// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
404*bdd1243dSDimitry Andric /// \endcode
405*bdd1243dSDimitry Andric ///
406*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407*bdd1243dSDimitry Andric ///
408*bdd1243dSDimitry Andric /// \param __A
409*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x unsigned char].
410*bdd1243dSDimitry Andric /// \param __B
411*bdd1243dSDimitry Andric /// A 128-bit vector of [16 x unsigned char].
412*bdd1243dSDimitry Andric /// \returns
413*bdd1243dSDimitry Andric /// A 128-bit vector of [4 x int].
414*bdd1243dSDimitry Andric ///
415*bdd1243dSDimitry Andric /// \code{.operation}
416*bdd1243dSDimitry Andric /// FOR j := 0 to 3
417*bdd1243dSDimitry Andric /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418*bdd1243dSDimitry Andric /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419*bdd1243dSDimitry Andric /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420*bdd1243dSDimitry Andric /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421*bdd1243dSDimitry Andric /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422*bdd1243dSDimitry Andric /// ENDFOR
423*bdd1243dSDimitry Andric /// dst[MAX:128] := 0
424*bdd1243dSDimitry Andric /// \endcode
_mm_dpbuuds_epi32(__m128i __W,__m128i __A,__m128i __B)425*bdd1243dSDimitry Andric static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426*bdd1243dSDimitry Andric __m128i __A,
427*bdd1243dSDimitry Andric __m128i __B) {
428*bdd1243dSDimitry Andric return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429*bdd1243dSDimitry Andric (__v4si)__B);
430*bdd1243dSDimitry Andric }
431*bdd1243dSDimitry Andric
432*bdd1243dSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
433*bdd1243dSDimitry Andric /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434*bdd1243dSDimitry Andric /// signed 16-bit results. Sum these 4 results with the corresponding
435*bdd1243dSDimitry Andric /// 32-bit integer in \a __W with signed saturation, and store the packed
436*bdd1243dSDimitry Andric /// 32-bit results in \a dst.
437*bdd1243dSDimitry Andric ///
438*bdd1243dSDimitry Andric /// \headerfile <x86intrin.h>
439*bdd1243dSDimitry Andric ///
440*bdd1243dSDimitry Andric /// \code
441*bdd1243dSDimitry Andric /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442*bdd1243dSDimitry Andric /// \endcode
443*bdd1243dSDimitry Andric ///
444*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445*bdd1243dSDimitry Andric ///
446*bdd1243dSDimitry Andric /// \param __A
447*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x unsigned char].
448*bdd1243dSDimitry Andric /// \param __B
449*bdd1243dSDimitry Andric /// A 256-bit vector of [32 x unsigned char].
450*bdd1243dSDimitry Andric /// \returns
451*bdd1243dSDimitry Andric /// A 256-bit vector of [8 x int].
452*bdd1243dSDimitry Andric ///
453*bdd1243dSDimitry Andric /// \code{.operation}
454*bdd1243dSDimitry Andric /// FOR j := 0 to 7
455*bdd1243dSDimitry Andric /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456*bdd1243dSDimitry Andric /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457*bdd1243dSDimitry Andric /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458*bdd1243dSDimitry Andric /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459*bdd1243dSDimitry Andric /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460*bdd1243dSDimitry Andric /// ENDFOR
461*bdd1243dSDimitry Andric /// dst[MAX:256] := 0
462*bdd1243dSDimitry Andric /// \endcode
463*bdd1243dSDimitry Andric static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W,__m256i __A,__m256i __B)464*bdd1243dSDimitry Andric _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465*bdd1243dSDimitry Andric return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466*bdd1243dSDimitry Andric (__v8si)__B);
467*bdd1243dSDimitry Andric }
468*bdd1243dSDimitry Andric #undef __DEFAULT_FN_ATTRS128
469*bdd1243dSDimitry Andric #undef __DEFAULT_FN_ATTRS256
470*bdd1243dSDimitry Andric
471*bdd1243dSDimitry Andric #endif // __AVXVNNIINT8INTRIN_H
472