1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10 #ifndef __IMMINTRIN_H
11 #error \
12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
14
15 #ifndef __AVXVNNIINT16INTRIN_H
16 #define __AVXVNNIINT16INTRIN_H
17
/* Attribute sets applied to every intrinsic defined in this file. The two
   variants carry the same attributes and differ only in the minimum vector
   width they declare (128 vs. 256). */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint16"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint16"), __min_vector_width__(256)))
25
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30 ///
31 /// \headerfile <immintrin.h>
32 ///
33 /// \code
34 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35 /// \endcode
36 ///
37 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
38 ///
39 /// \param __W
40 /// A 128-bit vector of [4 x int].
41 /// \param __A
42 /// A 128-bit vector of [8 x short].
43 /// \param __B
44 /// A 128-bit vector of [8 x unsigned short].
45 /// \returns
46 /// A 128-bit vector of [4 x int].
47 ///
48 /// \code{.operation}
49 /// FOR j := 0 to 3
50 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53 /// ENDFOR
54 /// dst[MAX:128] := 0
55 /// \endcode
_mm_dpwsud_epi32(__m128i __W,__m128i __A,__m128i __B)56 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57 __m128i __A,
58 __m128i __B) {
59 return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60 (__v4si)__B);
61 }
62
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67 ///
68 /// \headerfile <immintrin.h>
69 ///
70 /// \code
71 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72 /// \endcode
73 ///
74 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
75 ///
76 /// \param __W
77 /// A 256-bit vector of [8 x int].
78 /// \param __A
79 /// A 256-bit vector of [16 x short].
80 /// \param __B
81 /// A 256-bit vector of [16 x unsigned short].
82 /// \returns
83 /// A 256-bit vector of [8 x int].
84 ///
85 /// \code{.operation}
86 /// FOR j := 0 to 7
87 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90 /// ENDFOR
91 /// dst[MAX:256] := 0
92 /// \endcode
93 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W,__m256i __A,__m256i __B)94 _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95 return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96 (__v8si)__B);
97 }
98
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
104 ///
105 /// \headerfile <immintrin.h>
106 ///
107 /// \code
108 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109 /// \endcode
110 ///
111 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112 ///
113 /// \param __W
114 /// A 128-bit vector of [4 x int].
115 /// \param __A
116 /// A 128-bit vector of [8 x short].
117 /// \param __B
118 /// A 128-bit vector of [8 x unsigned short].
119 /// \returns
120 /// A 128-bit vector of [4 x int].
121 ///
122 /// \code{.operation}
123 /// FOR j := 0 to 3
124 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127 /// ENDFOR
128 /// dst[MAX:128] := 0
129 /// \endcode
_mm_dpwsuds_epi32(__m128i __W,__m128i __A,__m128i __B)130 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131 __m128i __A,
132 __m128i __B) {
133 return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134 (__v4si)__B);
135 }
136
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
142 ///
143 /// \headerfile <immintrin.h>
144 ///
145 /// \code
146 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147 /// \endcode
148 ///
149 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150 ///
151 /// \param __W
152 /// A 256-bit vector of [8 x int].
153 /// \param __A
154 /// A 256-bit vector of [16 x short].
155 /// \param __B
156 /// A 256-bit vector of [16 x unsigned short].
157 /// \returns
158 /// A 256-bit vector of [8 x int].
159 ///
160 /// \code{.operation}
161 /// FOR j := 0 to 7
162 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165 /// ENDFOR
166 /// dst[MAX:256] := 0
167 /// \endcode
168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W,__m256i __A,__m256i __B)169 _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170 return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171 (__v8si)__B);
172 }
173
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178 ///
179 /// \headerfile <immintrin.h>
180 ///
181 /// \code
/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183 /// \endcode
184 ///
185 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
186 ///
187 /// \param __W
188 /// A 128-bit vector of [4 x int].
189 /// \param __A
190 /// A 128-bit vector of [8 x unsigned short].
191 /// \param __B
192 /// A 128-bit vector of [8 x short].
193 /// \returns
194 /// A 128-bit vector of [4 x int].
195 ///
196 /// \code{.operation}
197 /// FOR j := 0 to 3
198 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201 /// ENDFOR
202 /// dst[MAX:128] := 0
203 /// \endcode
_mm_dpwusd_epi32(__m128i __W,__m128i __A,__m128i __B)204 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205 __m128i __A,
206 __m128i __B) {
207 return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208 (__v4si)__B);
209 }
210
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215 ///
216 /// \headerfile <immintrin.h>
217 ///
218 /// \code
219 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220 /// \endcode
221 ///
222 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
223 ///
224 /// \param __W
225 /// A 256-bit vector of [8 x int].
226 /// \param __A
227 /// A 256-bit vector of [16 x unsigned short].
228 /// \param __B
229 /// A 256-bit vector of [16 x short].
230 /// \returns
231 /// A 256-bit vector of [8 x int].
232 ///
233 /// \code{.operation}
234 /// FOR j := 0 to 7
235 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238 /// ENDFOR
239 /// dst[MAX:256] := 0
240 /// \endcode
241 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W,__m256i __A,__m256i __B)242 _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243 return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244 (__v8si)__B);
245 }
246
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
252 ///
253 /// \headerfile <immintrin.h>
254 ///
255 /// \code
256 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257 /// \endcode
258 ///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
260 ///
261 /// \param __W
262 /// A 128-bit vector of [4 x int].
263 /// \param __A
264 /// A 128-bit vector of [8 x unsigned short].
265 /// \param __B
266 /// A 128-bit vector of [8 x short].
267 /// \returns
268 /// A 128-bit vector of [4 x int].
269 ///
270 /// \code{.operation}
271 /// FOR j := 0 to 3
272 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275 /// ENDFOR
276 /// dst[MAX:128] := 0
277 /// \endcode
_mm_dpwusds_epi32(__m128i __W,__m128i __A,__m128i __B)278 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279 __m128i __A,
280 __m128i __B) {
281 return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282 (__v4si)__B);
283 }
284
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
290 ///
291 /// \headerfile <immintrin.h>
292 ///
293 /// \code
/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
295 /// \endcode
296 ///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
298 ///
299 /// \param __W
300 /// A 256-bit vector of [8 x int].
301 /// \param __A
302 /// A 256-bit vector of [16 x unsigned short].
303 /// \param __B
304 /// A 256-bit vector of [16 x short].
305 /// \returns
306 /// A 256-bit vector of [8 x int].
307 ///
308 /// \code{.operation}
309 /// FOR j := 0 to 7
310 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313 /// ENDFOR
314 /// dst[MAX:256] := 0
315 /// \endcode
316 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W,__m256i __A,__m256i __B)317 _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318 return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319 (__v8si)__B);
320 }
321
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326 ///
327 /// \headerfile <immintrin.h>
328 ///
329 /// \code
330 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331 /// \endcode
332 ///
333 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
334 ///
335 /// \param __W
336 /// A 128-bit vector of [4 x unsigned int].
337 /// \param __A
338 /// A 128-bit vector of [8 x unsigned short].
339 /// \param __B
340 /// A 128-bit vector of [8 x unsigned short].
341 /// \returns
342 /// A 128-bit vector of [4 x unsigned int].
343 ///
344 /// \code{.operation}
345 /// FOR j := 0 to 3
346 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349 /// ENDFOR
350 /// dst[MAX:128] := 0
351 /// \endcode
_mm_dpwuud_epi32(__m128i __W,__m128i __A,__m128i __B)352 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353 __m128i __A,
354 __m128i __B) {
355 return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356 (__v4si)__B);
357 }
358
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363 ///
364 /// \headerfile <immintrin.h>
365 ///
366 /// \code
367 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368 /// \endcode
369 ///
370 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
371 ///
372 /// \param __W
373 /// A 256-bit vector of [8 x unsigned int].
374 /// \param __A
375 /// A 256-bit vector of [16 x unsigned short].
376 /// \param __B
377 /// A 256-bit vector of [16 x unsigned short].
378 /// \returns
379 /// A 256-bit vector of [8 x unsigned int].
380 ///
381 /// \code{.operation}
382 /// FOR j := 0 to 7
383 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386 /// ENDFOR
387 /// dst[MAX:256] := 0
388 /// \endcode
389 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W,__m256i __A,__m256i __B)390 _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391 return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392 (__v8si)__B);
393 }
394
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
400 ///
401 /// \headerfile <immintrin.h>
402 ///
403 /// \code
/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
405 /// \endcode
406 ///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
408 ///
409 /// \param __W
410 /// A 128-bit vector of [4 x unsigned int].
411 /// \param __A
412 /// A 128-bit vector of [8 x unsigned short].
413 /// \param __B
414 /// A 128-bit vector of [8 x unsigned short].
415 /// \returns
416 /// A 128-bit vector of [4 x unsigned int].
417 ///
418 /// \code{.operation}
419 /// FOR j := 0 to 3
420 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
421 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
422 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
423 /// ENDFOR
424 /// dst[MAX:128] := 0
425 /// \endcode
_mm_dpwuuds_epi32(__m128i __W,__m128i __A,__m128i __B)426 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427 __m128i __A,
428 __m128i __B) {
429 return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430 (__v4si)__B);
431 }
432
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
438 ///
439 /// \headerfile <immintrin.h>
440 ///
441 /// \code
442 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443 /// \endcode
444 ///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
446 ///
447 /// \param __W
448 /// A 256-bit vector of [8 x unsigned int].
449 /// \param __A
450 /// A 256-bit vector of [16 x unsigned short].
451 /// \param __B
452 /// A 256-bit vector of [16 x unsigned short].
453 /// \returns
454 /// A 256-bit vector of [8 x unsigned int].
455 ///
456 /// \code{.operation}
457 /// FOR j := 0 to 7
458 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461 /// ENDFOR
462 /// dst[MAX:256] := 0
463 /// \endcode
464 static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W,__m256i __A,__m256i __B)465 _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466 return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467 (__v8si)__B);
468 }
469
470 #undef __DEFAULT_FN_ATTRS128
471 #undef __DEFAULT_FN_ATTRS256
472
473 #endif // __AVXVNNIINT16INTRIN_H
474