xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avxvnniint8intrin.h (revision 02e9120893770924227138ba49df1edb3896112a)
1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error                                                                         \
11     "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12 #endif
13 
14 #ifndef __AVXVNNIINT8INTRIN_H
15 #define __AVXVNNIINT8INTRIN_H
16 
17 /* Default function attributes for the intrinsics below (#undef'd at EOF). */
18 #define __DEFAULT_FN_ATTRS256                                                  \
19   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
20                  __min_vector_width__(256)))
21 #define __DEFAULT_FN_ATTRS128                                                  \
22   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
23                  __min_vector_width__(128)))
24 
25 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27 ///    signed 16-bit results. Sum these 4 results with the corresponding
28 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29 ///
30 /// \headerfile <immintrin.h>
31 ///
32 /// \code
33 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34 /// \endcode
35 ///
36 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
37 ///
38 /// \param __A
39 ///    A 128-bit vector of [16 x char].
40 /// \param __B
41 ///    A 128-bit vector of [16 x char].
42 /// \returns
43 ///    A 128-bit vector of [4 x int].
44 ///
45 /// \code{.operation}
46 /// FOR j := 0 to 3
47 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52 /// ENDFOR
53 /// dst[MAX:128] := 0
54 /// \endcode
55 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56                                                                  __m128i __A,
57                                                                  __m128i __B) {
58   return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59                                              (__v4si)__B);
60 }
61 
62 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64 ///    signed 16-bit results. Sum these 4 results with the corresponding
65 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66 ///
67 /// \headerfile <immintrin.h>
68 ///
69 /// \code
70 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71 /// \endcode
72 ///
73 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
74 ///
75 /// \param __A
76 ///    A 256-bit vector of [32 x char].
77 /// \param __B
78 ///    A 256-bit vector of [32 x char].
79 /// \returns
80 ///    A 256-bit vector of [8 x int].
81 ///
82 /// \code{.operation}
83 /// FOR j := 0 to 7
84 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89 /// ENDFOR
90 /// dst[MAX:256] := 0
91 /// \endcode
92 static __inline__ __m256i __DEFAULT_FN_ATTRS256
93 _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94   return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95                                              (__v8si)__B);
96 }
97 
98 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100 ///    signed 16-bit results. Sum these 4 results with the corresponding
101 ///    32-bit integer in \a __W with signed saturation, and store the packed
102 ///    32-bit results in \a dst.
103 ///
104 /// \headerfile <immintrin.h>
105 ///
106 /// \code
107 /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
108 /// \endcode
109 ///
110 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
111 ///
112 /// \param __A
113 ///    A 128-bit vector of [16 x char].
114 /// \param __B
115 ///    A 128-bit vector of [16 x char].
116 /// \returns
117 ///    A 128-bit vector of [4 x int].
118 ///
119 /// \code{.operation}
120 /// FOR j := 0 to 3
121 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126 /// ENDFOR
127 /// dst[MAX:128] := 0
128 /// \endcode
129 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130                                                                   __m128i __A,
131                                                                   __m128i __B) {
132   return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133                                               (__v4si)__B);
134 }
135 
136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138 ///    signed 16-bit results. Sum these 4 results with the corresponding
139 ///    32-bit integer in \a __W with signed saturation, and store the packed
140 ///    32-bit results in \a dst.
141 ///
142 /// \headerfile <immintrin.h>
143 ///
144 /// \code
145 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146 /// \endcode
147 ///
148 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
149 ///
150 /// \param __A
151 ///    A 256-bit vector of [32 x char].
152 /// \param __B
153 ///    A 256-bit vector of [32 x char].
154 /// \returns
155 ///    A 256-bit vector of [8 x int].
156 ///
157 /// \code{.operation}
158 /// FOR j := 0 to 7
159 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164 /// ENDFOR
165 /// dst[MAX:256] := 0
166 /// \endcode
167 static __inline__ __m256i __DEFAULT_FN_ATTRS256
168 _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169   return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170                                               (__v8si)__B);
171 }
172 
173 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175 ///    signed 16-bit results. Sum these 4 results with the corresponding
176 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177 ///
178 /// \headerfile <immintrin.h>
179 ///
180 /// \code
181 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182 /// \endcode
183 ///
184 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
185 ///
186 /// \param __A
187 ///    A 128-bit vector of [16 x char].
188 /// \param __B
189 ///    A 128-bit vector of [16 x unsigned char].
190 /// \returns
191 ///    A 128-bit vector of [4 x int].
192 ///
193 /// \code{.operation}
194 /// FOR j := 0 to 3
195 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200 /// ENDFOR
201 /// dst[MAX:128] := 0
202 /// \endcode
203 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204                                                                  __m128i __A,
205                                                                  __m128i __B) {
206   return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207                                              (__v4si)__B);
208 }
209 
210 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212 ///    signed 16-bit results. Sum these 4 results with the corresponding
213 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214 ///
215 /// \headerfile <immintrin.h>
216 ///
217 /// \code
218 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219 /// \endcode
220 ///
221 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
222 ///
223 /// \param __A
224 ///    A 256-bit vector of [32 x char].
225 /// \param __B
226 ///    A 256-bit vector of [32 x unsigned char].
227 /// \returns
228 ///    A 256-bit vector of [8 x int].
229 ///
230 /// \code{.operation}
231 /// FOR j := 0 to 7
232 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237 /// ENDFOR
238 /// dst[MAX:256] := 0
239 /// \endcode
240 static __inline__ __m256i __DEFAULT_FN_ATTRS256
241 _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242   return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243                                              (__v8si)__B);
244 }
245 
246 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248 ///    signed 16-bit results. Sum these 4 results with the corresponding
249 ///    32-bit integer in \a __W with signed saturation, and store the packed
250 ///    32-bit results in \a dst.
251 ///
252 /// \headerfile <immintrin.h>
253 ///
254 /// \code
255 /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
256 /// \endcode
257 ///
258 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
259 ///
260 /// \param __A
261 ///    A 128-bit vector of [16 x char].
262 /// \param __B
263 ///    A 128-bit vector of [16 x unsigned char].
264 /// \returns
265 ///    A 128-bit vector of [4 x int].
266 ///
267 /// \code{.operation}
268 /// FOR j := 0 to 3
269 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274 /// ENDFOR
275 /// dst[MAX:128] := 0
276 /// \endcode
277 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278                                                                   __m128i __A,
279                                                                   __m128i __B) {
280   return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281                                               (__v4si)__B);
282 }
283 
284 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286 ///    signed 16-bit results. Sum these 4 results with the corresponding
287 ///    32-bit integer in \a __W with signed saturation, and store the packed
288 ///    32-bit results in \a dst.
289 ///
290 /// \headerfile <immintrin.h>
291 ///
292 /// \code
293 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294 /// \endcode
295 ///
296 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
297 ///
298 /// \param __A
299 ///    A 256-bit vector of [32 x char].
300 /// \param __B
301 ///    A 256-bit vector of [32 x unsigned char].
302 /// \returns
303 ///    A 256-bit vector of [8 x int].
304 ///
305 /// \code{.operation}
306 /// FOR j := 0 to 7
307 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312 /// ENDFOR
313 /// dst[MAX:256] := 0
314 /// \endcode
315 static __inline__ __m256i __DEFAULT_FN_ATTRS256
316 _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317   return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318                                               (__v8si)__B);
319 }
320 
321 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
324 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325 ///
326 /// \headerfile <immintrin.h>
327 ///
328 /// \code
329 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330 /// \endcode
331 ///
332 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
333 ///
334 /// \param __A
335 ///    A 128-bit vector of [16 x unsigned char].
336 /// \param __B
337 ///    A 128-bit vector of [16 x unsigned char].
338 /// \returns
339 ///    A 128-bit vector of [4 x int].
340 ///
341 /// \code{.operation}
342 /// FOR j := 0 to 3
343 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348 /// ENDFOR
349 /// dst[MAX:128] := 0
350 /// \endcode
351 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352                                                                  __m128i __A,
353                                                                  __m128i __B) {
354   return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355                                              (__v4si)__B);
356 }
357 
358 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
361 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362 ///
363 /// \headerfile <immintrin.h>
364 ///
365 /// \code
366 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367 /// \endcode
368 ///
369 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
370 ///
371 /// \param __A
372 ///    A 256-bit vector of [32 x unsigned char].
373 /// \param __B
374 ///    A 256-bit vector of [32 x unsigned char].
375 /// \returns
376 ///    A 256-bit vector of [8 x int].
377 ///
378 /// \code{.operation}
379 /// FOR j := 0 to 7
380 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385 /// ENDFOR
386 /// dst[MAX:256] := 0
387 /// \endcode
388 static __inline__ __m256i __DEFAULT_FN_ATTRS256
389 _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390   return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391                                              (__v8si)__B);
392 }
393 
394 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
397 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
398 ///    32-bit results in \a dst.
399 ///
400 /// \headerfile <immintrin.h>
401 ///
402 /// \code
403 /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
404 /// \endcode
405 ///
406 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407 ///
408 /// \param __A
409 ///    A 128-bit vector of [16 x unsigned char].
410 /// \param __B
411 ///    A 128-bit vector of [16 x unsigned char].
412 /// \returns
413 ///    A 128-bit vector of [4 x int].
414 ///
415 /// \code{.operation}
416 /// FOR j := 0 to 3
417 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422 /// ENDFOR
423 /// dst[MAX:128] := 0
424 /// \endcode
425 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426                                                                   __m128i __A,
427                                                                   __m128i __B) {
428   return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429                                               (__v4si)__B);
430 }
431 
432 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
433 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
435 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
436 ///    32-bit results in \a dst.
437 ///
438 /// \headerfile <immintrin.h>
439 ///
440 /// \code
441 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442 /// \endcode
443 ///
444 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445 ///
446 /// \param __A
447 ///    A 256-bit vector of [32 x unsigned char].
448 /// \param __B
449 ///    A 256-bit vector of [32 x unsigned char].
450 /// \returns
451 ///    A 256-bit vector of [8 x int].
452 ///
453 /// \code{.operation}
454 /// FOR j := 0 to 7
455 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460 /// ENDFOR
461 /// dst[MAX:256] := 0
462 /// \endcode
463 static __inline__ __m256i __DEFAULT_FN_ATTRS256
464 _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465   return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466                                               (__v8si)__B);
467 }
468 #undef __DEFAULT_FN_ATTRS128
469 #undef __DEFAULT_FN_ATTRS256
470 
471 #endif // __AVXVNNIINT8INTRIN_H
472