1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error \ 12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead." 13 #endif // __IMMINTRIN_H 14 15 #ifndef __AVXVNNIINT16INTRIN_H 16 #define __AVXVNNIINT16INTRIN_H 17 18 /* Define the default attributes for the functions in this file. */ 19 #define __DEFAULT_FN_ATTRS128 \ 20 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ 21 __min_vector_width__(128))) 22 #define __DEFAULT_FN_ATTRS256 \ 23 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ 24 __min_vector_width__(256))) 25 26 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 27 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 28 /// signed 16-bit results. Sum these 2 results with the corresponding 29 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 30 /// 31 /// \headerfile <immintrin.h> 32 /// 33 /// \code 34 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B) 35 /// \endcode 36 /// 37 /// This intrinsic corresponds to the \c VPDPWSUD instruction. 38 /// 39 /// \param __W 40 /// A 128-bit vector of [4 x int]. 41 /// \param __A 42 /// A 128-bit vector of [8 x short]. 43 /// \param __B 44 /// A 128-bit vector of [8 x unsigned short]. 45 /// \returns 46 /// A 128-bit vector of [4 x int]. 47 /// 48 /// \code{.operation} 49 /// FOR j := 0 to 3 50 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 51 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 52 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 53 /// ENDFOR 54 /// dst[MAX:128] := 0 55 /// \endcode 56 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, 57 __m128i __A, 58 __m128i __B) { 59 return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A, 60 (__v4si)__B); 61 } 62 63 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 64 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 65 /// signed 16-bit results. Sum these 2 results with the corresponding 66 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 67 /// 68 /// \headerfile <immintrin.h> 69 /// 70 /// \code 71 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) 72 /// \endcode 73 /// 74 /// This intrinsic corresponds to the \c VPDPWSUD instruction. 75 /// 76 /// \param __W 77 /// A 256-bit vector of [8 x int]. 78 /// \param __A 79 /// A 256-bit vector of [16 x short]. 80 /// \param __B 81 /// A 256-bit vector of [16 x unsigned short]. 82 /// \returns 83 /// A 256-bit vector of [8 x int]. 84 /// 85 /// \code{.operation} 86 /// FOR j := 0 to 7 87 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 88 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 89 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 90 /// ENDFOR 91 /// dst[MAX:256] := 0 92 /// \endcode 93 static __inline__ __m256i __DEFAULT_FN_ATTRS256 94 _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { 95 return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A, 96 (__v8si)__B); 97 } 98 99 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 100 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 101 /// signed 16-bit results. Sum these 2 results with the corresponding 102 /// 32-bit integer in \a __W with signed saturation, and store the packed 103 /// 32-bit results in \a dst. 104 /// 105 /// \headerfile <immintrin.h> 106 /// 107 /// \code 108 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) 109 /// \endcode 110 /// 111 /// This intrinsic corresponds to the \c VPDPWSUDS instruction. 112 /// 113 /// \param __W 114 /// A 128-bit vector of [4 x int]. 115 /// \param __A 116 /// A 128-bit vector of [8 x short]. 117 /// \param __B 118 /// A 128-bit vector of [8 x unsigned short]. 119 /// \returns 120 /// A 128-bit vector of [4 x int]. 121 /// 122 /// \code{.operation} 123 /// FOR j := 0 to 3 124 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 125 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 126 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 127 /// ENDFOR 128 /// dst[MAX:128] := 0 129 /// \endcode 130 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, 131 __m128i __A, 132 __m128i __B) { 133 return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A, 134 (__v4si)__B); 135 } 136 137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 138 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 139 /// signed 16-bit results. Sum these 2 results with the corresponding 140 /// 32-bit integer in \a __W with signed saturation, and store the packed 141 /// 32-bit results in \a dst. 142 /// 143 /// \headerfile <immintrin.h> 144 /// 145 /// \code 146 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) 147 /// \endcode 148 /// 149 /// This intrinsic corresponds to the \c VPDPWSUDS instruction. 150 /// 151 /// \param __W 152 /// A 256-bit vector of [8 x int]. 153 /// \param __A 154 /// A 256-bit vector of [16 x short]. 155 /// \param __B 156 /// A 256-bit vector of [16 x unsigned short]. 157 /// \returns 158 /// A 256-bit vector of [8 x int]. 159 /// 160 /// \code{.operation} 161 /// FOR j := 0 to 7 162 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 163 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 164 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 165 /// ENDFOR 166 /// dst[MAX:256] := 0 167 /// \endcode 168 static __inline__ __m256i __DEFAULT_FN_ATTRS256 169 _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { 170 return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A, 171 (__v8si)__B); 172 } 173 174 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 175 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 176 /// signed 16-bit results. Sum these 2 results with the corresponding 177 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 178 /// 179 /// \headerfile <immintrin.h> 180 /// 181 /// \code 182 /// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B) 183 /// \endcode 184 /// 185 /// This intrinsic corresponds to the \c VPDPWUSD instruction. 186 /// 187 /// \param __W 188 /// A 128-bit vector of [4 x int]. 189 /// \param __A 190 /// A 128-bit vector of [8 x unsigned short]. 191 /// \param __B 192 /// A 128-bit vector of [8 x short]. 193 /// \returns 194 /// A 128-bit vector of [4 x int]. 195 /// 196 /// \code{.operation} 197 /// FOR j := 0 to 3 198 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 199 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 200 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 201 /// ENDFOR 202 /// dst[MAX:128] := 0 203 /// \endcode 204 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, 205 __m128i __A, 206 __m128i __B) { 207 return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A, 208 (__v4si)__B); 209 } 210 211 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 212 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 213 /// signed 16-bit results. Sum these 2 results with the corresponding 214 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 215 /// 216 /// \headerfile <immintrin.h> 217 /// 218 /// \code 219 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) 220 /// \endcode 221 /// 222 /// This intrinsic corresponds to the \c VPDPWUSD instruction. 223 /// 224 /// \param __W 225 /// A 256-bit vector of [8 x int]. 226 /// \param __A 227 /// A 256-bit vector of [16 x unsigned short]. 228 /// \param __B 229 /// A 256-bit vector of [16 x short]. 230 /// \returns 231 /// A 256-bit vector of [8 x int]. 232 /// 233 /// \code{.operation} 234 /// FOR j := 0 to 7 235 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 236 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 237 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 238 /// ENDFOR 239 /// dst[MAX:256] := 0 240 /// \endcode 241 static __inline__ __m256i __DEFAULT_FN_ATTRS256 242 _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { 243 return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A, 244 (__v8si)__B); 245 } 246 247 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 248 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 249 /// signed 16-bit results. Sum these 2 results with the corresponding 250 /// 32-bit integer in \a __W with signed saturation, and store the packed 251 /// 32-bit results in \a dst. 252 /// 253 /// \headerfile <immintrin.h> 254 /// 255 /// \code 256 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B) 257 /// \endcode 258 /// 259 /// This intrinsic corresponds to the \c VPDPWSUDS instruction. 260 /// 261 /// \param __W 262 /// A 128-bit vector of [4 x int]. 263 /// \param __A 264 /// A 128-bit vector of [8 x unsigned short]. 265 /// \param __B 266 /// A 128-bit vector of [8 x short]. 267 /// \returns 268 /// A 128-bit vector of [4 x int]. 269 /// 270 /// \code{.operation} 271 /// FOR j := 0 to 3 272 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 273 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 274 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 275 /// ENDFOR 276 /// dst[MAX:128] := 0 277 /// \endcode 278 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, 279 __m128i __A, 280 __m128i __B) { 281 return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A, 282 (__v4si)__B); 283 } 284 285 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 286 /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate 287 /// signed 16-bit results. Sum these 2 results with the corresponding 288 /// 32-bit integer in \a __W with signed saturation, and store the packed 289 /// 32-bit results in \a dst. 290 /// 291 /// \headerfile <immintrin.h> 292 /// 293 /// \code 294 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) 295 /// \endcode 296 /// 297 /// This intrinsic corresponds to the \c VPDPWSUDS instruction. 298 /// 299 /// \param __W 300 /// A 256-bit vector of [8 x int]. 301 /// \param __A 302 /// A 256-bit vector of [16 x unsigned short]. 303 /// \param __B 304 /// A 256-bit vector of [16 x short]. 305 /// \returns 306 /// A 256-bit vector of [8 x int]. 307 /// 308 /// \code{.operation} 309 /// FOR j := 0 to 7 310 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 311 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 312 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 313 /// ENDFOR 314 /// dst[MAX:256] := 0 315 /// \endcode 316 static __inline__ __m256i __DEFAULT_FN_ATTRS256 317 _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { 318 return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A, 319 (__v8si)__B); 320 } 321 322 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 323 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 324 /// signed 16-bit results. Sum these 2 results with the corresponding 325 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 326 /// 327 /// \headerfile <immintrin.h> 328 /// 329 /// \code 330 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B) 331 /// \endcode 332 /// 333 /// This intrinsic corresponds to the \c VPDPWUUD instruction. 334 /// 335 /// \param __W 336 /// A 128-bit vector of [4 x unsigned int]. 337 /// \param __A 338 /// A 128-bit vector of [8 x unsigned short]. 339 /// \param __B 340 /// A 128-bit vector of [8 x unsigned short]. 341 /// \returns 342 /// A 128-bit vector of [4 x unsigned int]. 343 /// 344 /// \code{.operation} 345 /// FOR j := 0 to 3 346 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 347 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 348 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 349 /// ENDFOR 350 /// dst[MAX:128] := 0 351 /// \endcode 352 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, 353 __m128i __A, 354 __m128i __B) { 355 return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A, 356 (__v4si)__B); 357 } 358 359 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 360 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 361 /// signed 16-bit results. Sum these 2 results with the corresponding 362 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 363 /// 364 /// \headerfile <immintrin.h> 365 /// 366 /// \code 367 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) 368 /// \endcode 369 /// 370 /// This intrinsic corresponds to the \c VPDPWUUD instruction. 371 /// 372 /// \param __W 373 /// A 256-bit vector of [8 x unsigned int]. 374 /// \param __A 375 /// A 256-bit vector of [16 x unsigned short]. 376 /// \param __B 377 /// A 256-bit vector of [16 x unsigned short]. 378 /// \returns 379 /// A 256-bit vector of [8 x unsigned int]. 380 /// 381 /// \code{.operation} 382 /// FOR j := 0 to 7 383 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 384 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 385 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 386 /// ENDFOR 387 /// dst[MAX:256] := 0 388 /// \endcode 389 static __inline__ __m256i __DEFAULT_FN_ATTRS256 390 _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { 391 return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A, 392 (__v8si)__B); 393 } 394 395 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 396 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 397 /// signed 16-bit results. Sum these 2 results with the corresponding 398 /// 32-bit integer in \a __W with signed saturation, and store the packed 399 /// 32-bit results in \a dst. 400 /// 401 /// \headerfile <immintrin.h> 402 /// 403 /// \code 404 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) 405 /// \endcode 406 /// 407 /// This intrinsic corresponds to the \c VPDPWSUDS instruction. 408 /// 409 /// \param __W 410 /// A 128-bit vector of [4 x unsigned int]. 411 /// \param __A 412 /// A 128-bit vector of [8 x unsigned short]. 413 /// \param __B 414 /// A 128-bit vector of [8 x unsigned short]. 415 /// \returns 416 /// A 128-bit vector of [4 x unsigned int]. 417 /// 418 /// \code{.operation} 419 /// FOR j := 0 to 3 420 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 421 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 422 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 423 /// ENDFOR 424 /// dst[MAX:128] := 0 425 /// \endcode 426 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, 427 __m128i __A, 428 __m128i __B) { 429 return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A, 430 (__v4si)__B); 431 } 432 433 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with 434 /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate 435 /// signed 16-bit results. Sum these 2 results with the corresponding 436 /// 32-bit integer in \a __W with signed saturation, and store the packed 437 /// 32-bit results in \a dst. 438 /// 439 /// \headerfile <immintrin.h> 440 /// 441 /// \code 442 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) 443 /// \endcode 444 /// 445 /// This intrinsic corresponds to the \c VPDPWSUDS instruction. 446 /// 447 /// \param __W 448 /// A 256-bit vector of [8 x unsigned int]. 449 /// \param __A 450 /// A 256-bit vector of [16 x unsigned short]. 451 /// \param __B 452 /// A 256-bit vector of [16 x unsigned short]. 453 /// \returns 454 /// A 256-bit vector of [8 x unsigned int]. 455 /// 456 /// \code{.operation} 457 /// FOR j := 0 to 7 458 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) 459 /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) 460 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) 461 /// ENDFOR 462 /// dst[MAX:256] := 0 463 /// \endcode 464 static __inline__ __m256i __DEFAULT_FN_ATTRS256 465 _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { 466 return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A, 467 (__v8si)__B); 468 } 469 470 #undef __DEFAULT_FN_ATTRS128 471 #undef __DEFAULT_FN_ATTRS256 472 473 #endif // __AVXVNNIINT16INTRIN_H 474