1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error \ 11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __AVXVNNIINT8INTRIN_H 15 #define __AVXVNNIINT8INTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS256 \ 19 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ 20 __min_vector_width__(256))) 21 #define __DEFAULT_FN_ATTRS128 \ 22 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ 23 __min_vector_width__(128))) 24 25 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 26 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate 27 /// signed 16-bit results. Sum these 4 results with the corresponding 28 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 29 /// 30 /// \headerfile <x86intrin.h> 31 /// 32 /// \code 33 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B); 34 /// \endcode 35 /// 36 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 37 /// 38 /// \param __A 39 /// A 128-bit vector of [16 x char]. 40 /// \param __B 41 /// A 128-bit vector of [16 x char]. 42 /// \returns 43 /// A 128-bit vector of [4 x int]. 44 /// 45 /// \code{.operation} 46 /// FOR j := 0 to 3 47 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) 48 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) 49 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) 50 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) 51 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 52 /// ENDFOR 53 /// dst[MAX:128] := 0 54 /// \endcode 55 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, 56 __m128i __A, 57 __m128i __B) { 58 return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A, 59 (__v4si)__B); 60 } 61 62 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 63 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate 64 /// signed 16-bit results. Sum these 4 results with the corresponding 65 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 66 /// 67 /// \headerfile <x86intrin.h> 68 /// 69 /// \code 70 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B); 71 /// \endcode 72 /// 73 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 74 /// 75 /// \param __A 76 /// A 256-bit vector of [32 x char]. 77 /// \param __B 78 /// A 256-bit vector of [32 x char]. 79 /// \returns 80 /// A 256-bit vector of [8 x int]. 81 /// 82 /// \code{.operation} 83 /// FOR j := 0 to 7 84 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) 85 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) 86 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) 87 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) 88 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 89 /// ENDFOR 90 /// dst[MAX:256] := 0 91 /// \endcode 92 static __inline__ __m256i __DEFAULT_FN_ATTRS256 93 _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { 94 return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A, 95 (__v8si)__B); 96 } 97 98 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 99 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate 100 /// signed 16-bit results. Sum these 4 results with the corresponding 101 /// 32-bit integer in \a __W with signed saturation, and store the packed 102 /// 32-bit results in \a dst. 103 /// 104 /// \headerfile <x86intrin.h> 105 /// 106 /// \code 107 /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B); 108 /// \endcode 109 /// 110 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 111 /// 112 /// \param __A 113 /// A 128-bit vector of [16 x char]. 114 /// \param __B 115 /// A 128-bit vector of [16 x char]. 116 /// \returns 117 /// A 128-bit vector of [4 x int]. 118 /// 119 /// \code{.operation} 120 /// FOR j := 0 to 3 121 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) 122 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) 123 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) 124 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) 125 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 126 /// ENDFOR 127 /// dst[MAX:128] := 0 128 /// \endcode 129 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, 130 __m128i __A, 131 __m128i __B) { 132 return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A, 133 (__v4si)__B); 134 } 135 136 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 137 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate 138 /// signed 16-bit results. Sum these 4 results with the corresponding 139 /// 32-bit integer in \a __W with signed saturation, and store the packed 140 /// 32-bit results in \a dst. 141 /// 142 /// \headerfile <x86intrin.h> 143 /// 144 /// \code 145 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B); 146 /// \endcode 147 /// 148 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 149 /// 150 /// \param __A 151 /// A 256-bit vector of [32 x char]. 152 /// \param __B 153 /// A 256-bit vector of [32 x char]. 154 /// \returns 155 /// A 256-bit vector of [8 x int]. 156 /// 157 /// \code{.operation} 158 /// FOR j := 0 to 7 159 /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) 160 /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) 161 /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) 162 /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) 163 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 164 /// ENDFOR 165 /// dst[MAX:256] := 0 166 /// \endcode 167 static __inline__ __m256i __DEFAULT_FN_ATTRS256 168 _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { 169 return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A, 170 (__v8si)__B); 171 } 172 173 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 174 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 175 /// signed 16-bit results. Sum these 4 results with the corresponding 176 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 177 /// 178 /// \headerfile <x86intrin.h> 179 /// 180 /// \code 181 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B); 182 /// \endcode 183 /// 184 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 185 /// 186 /// \param __A 187 /// A 128-bit vector of [16 x char]. 188 /// \param __B 189 /// A 128-bit vector of [16 x unsigned char]. 190 /// \returns 191 /// A 128-bit vector of [4 x int]. 192 /// 193 /// \code{.operation} 194 /// FOR j := 0 to 3 195 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) 196 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) 197 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) 198 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) 199 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 200 /// ENDFOR 201 /// dst[MAX:128] := 0 202 /// \endcode 203 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, 204 __m128i __A, 205 __m128i __B) { 206 return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A, 207 (__v4si)__B); 208 } 209 210 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 211 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 212 /// signed 16-bit results. Sum these 4 results with the corresponding 213 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 214 /// 215 /// \headerfile <x86intrin.h> 216 /// 217 /// \code 218 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B); 219 /// \endcode 220 /// 221 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 222 /// 223 /// \param __A 224 /// A 256-bit vector of [32 x char]. 225 /// \param __B 226 /// A 256-bit vector of [32 x unsigned char]. 227 /// \returns 228 /// A 256-bit vector of [8 x int]. 229 /// 230 /// \code{.operation} 231 /// FOR j := 0 to 7 232 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) 233 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) 234 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) 235 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) 236 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 237 /// ENDFOR 238 /// dst[MAX:256] := 0 239 /// \endcode 240 static __inline__ __m256i __DEFAULT_FN_ATTRS256 241 _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { 242 return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A, 243 (__v8si)__B); 244 } 245 246 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 247 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 248 /// signed 16-bit results. Sum these 4 results with the corresponding 249 /// 32-bit integer in \a __W with signed saturation, and store the packed 250 /// 32-bit results in \a dst. 251 /// 252 /// \headerfile <x86intrin.h> 253 /// 254 /// \code 255 /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B); 256 /// \endcode 257 /// 258 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 259 /// 260 /// \param __A 261 /// A 128-bit vector of [16 x char]. 262 /// \param __B 263 /// A 128-bit vector of [16 x unsigned char]. 264 /// \returns 265 /// A 128-bit vector of [4 x int]. 266 /// 267 /// \code{.operation} 268 /// FOR j := 0 to 3 269 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) 270 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) 271 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) 272 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) 273 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 274 /// ENDFOR 275 /// dst[MAX:128] := 0 276 /// \endcode 277 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, 278 __m128i __A, 279 __m128i __B) { 280 return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A, 281 (__v4si)__B); 282 } 283 284 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 285 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 286 /// signed 16-bit results. Sum these 4 results with the corresponding 287 /// 32-bit integer in \a __W with signed saturation, and store the packed 288 /// 32-bit results in \a dst. 289 /// 290 /// \headerfile <x86intrin.h> 291 /// 292 /// \code 293 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B); 294 /// \endcode 295 /// 296 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 297 /// 298 /// \param __A 299 /// A 256-bit vector of [32 x char]. 300 /// \param __B 301 /// A 256-bit vector of [32 x unsigned char]. 302 /// \returns 303 /// A 256-bit vector of [8 x int]. 304 /// 305 /// \code{.operation} 306 /// FOR j := 0 to 7 307 /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) 308 /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) 309 /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) 310 /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) 311 /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 312 /// ENDFOR 313 /// dst[MAX:256] := 0 314 /// \endcode 315 static __inline__ __m256i __DEFAULT_FN_ATTRS256 316 _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { 317 return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A, 318 (__v8si)__B); 319 } 320 321 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with 322 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 323 /// signed 16-bit results. Sum these 4 results with the corresponding 324 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 325 /// 326 /// \headerfile <x86intrin.h> 327 /// 328 /// \code 329 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B); 330 /// \endcode 331 /// 332 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 333 /// 334 /// \param __A 335 /// A 128-bit vector of [16 x unsigned char]. 336 /// \param __B 337 /// A 128-bit vector of [16 x unsigned char]. 338 /// \returns 339 /// A 128-bit vector of [4 x int]. 340 /// 341 /// \code{.operation} 342 /// FOR j := 0 to 3 343 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) 344 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) 345 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) 346 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) 347 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 348 /// ENDFOR 349 /// dst[MAX:128] := 0 350 /// \endcode 351 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, 352 __m128i __A, 353 __m128i __B) { 354 return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A, 355 (__v4si)__B); 356 } 357 358 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with 359 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 360 /// signed 16-bit results. Sum these 4 results with the corresponding 361 /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 362 /// 363 /// \headerfile <x86intrin.h> 364 /// 365 /// \code 366 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B); 367 /// \endcode 368 /// 369 /// This intrinsic corresponds to the \c VPDPBSSD instruction. 370 /// 371 /// \param __A 372 /// A 256-bit vector of [32 x unsigned char]. 373 /// \param __B 374 /// A 256-bit vector of [32 x unsigned char]. 375 /// \returns 376 /// A 256-bit vector of [8 x int]. 377 /// 378 /// \code{.operation} 379 /// FOR j := 0 to 7 380 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) 381 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) 382 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) 383 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) 384 /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 385 /// ENDFOR 386 /// dst[MAX:256] := 0 387 /// \endcode 388 static __inline__ __m256i __DEFAULT_FN_ATTRS256 389 _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { 390 return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A, 391 (__v8si)__B); 392 } 393 394 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with 395 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 396 /// signed 16-bit results. Sum these 4 results with the corresponding 397 /// 32-bit integer in \a __W with signed saturation, and store the packed 398 /// 32-bit results in \a dst. 399 /// 400 /// \headerfile <x86intrin.h> 401 /// 402 /// \code 403 /// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B); 404 /// \endcode 405 /// 406 /// This intrinsic corresponds to the \c VPDPBUUDS instruction. 407 /// 408 /// \param __A 409 /// A 128-bit vector of [16 x unsigned char]. 410 /// \param __B 411 /// A 128-bit vector of [16 x unsigned char]. 412 /// \returns 413 /// A 128-bit vector of [4 x int]. 414 /// 415 /// \code{.operation} 416 /// FOR j := 0 to 3 417 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) 418 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) 419 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) 420 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) 421 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 422 /// ENDFOR 423 /// dst[MAX:128] := 0 424 /// \endcode 425 static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, 426 __m128i __A, 427 __m128i __B) { 428 return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A, 429 (__v4si)__B); 430 } 431 432 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with 433 /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate 434 /// signed 16-bit results. Sum these 4 results with the corresponding 435 /// 32-bit integer in \a __W with signed saturation, and store the packed 436 /// 32-bit results in \a dst. 437 /// 438 /// \headerfile <x86intrin.h> 439 /// 440 /// \code 441 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B); 442 /// \endcode 443 /// 444 /// This intrinsic corresponds to the \c VPDPBUUDS instruction. 445 /// 446 /// \param __A 447 /// A 256-bit vector of [32 x unsigned char]. 448 /// \param __B 449 /// A 256-bit vector of [32 x unsigned char]. 450 /// \returns 451 /// A 256-bit vector of [8 x int]. 452 /// 453 /// \code{.operation} 454 /// FOR j := 0 to 7 455 /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) 456 /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) 457 /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) 458 /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) 459 /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 460 /// ENDFOR 461 /// dst[MAX:256] := 0 462 /// \endcode 463 static __inline__ __m256i __DEFAULT_FN_ATTRS256 464 _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { 465 return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A, 466 (__v8si)__B); 467 } 468 #undef __DEFAULT_FN_ATTRS128 469 #undef __DEFAULT_FN_ATTRS256 470 471 #endif // __AVXVNNIINT8INTRIN_H 472