1 /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== 2 * 3 * 4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 * See https://llvm.org/LICENSE.txt for license information. 6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 * 8 *===-----------------------------------------------------------------------=== 9 */ 10 #ifndef __IMMINTRIN_H 11 #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __AVX512VLVNNIINTRIN_H 15 #define __AVX512VLVNNIINTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS128 \ 19 __attribute__((__always_inline__, __nodebug__, \ 20 __target__("avx512vl,avx512vnni,no-evex512"), \ 21 __min_vector_width__(128))) 22 #define __DEFAULT_FN_ATTRS256 \ 23 __attribute__((__always_inline__, __nodebug__, \ 24 __target__("avx512vl,avx512vnni,no-evex512"), \ 25 __min_vector_width__(256))) 26 27 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 28 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 29 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 30 /// in \a S, and store the packed 32-bit results in DST. 31 /// 32 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 33 /// 34 /// \code{.operation} 35 /// FOR j := 0 to 7 36 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 37 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 38 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 39 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 40 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 41 /// ENDFOR 42 /// DST[MAX:256] := 0 43 /// \endcode 44 #define _mm256_dpbusd_epi32(S, A, B) \ 45 ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 46 47 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 48 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 49 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 50 /// in \a S using signed saturation, and store the packed 32-bit results in DST. 51 /// 52 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 53 /// 54 /// \code{.operation} 55 /// FOR j := 0 to 7 56 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 57 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 58 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 59 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 60 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 61 /// ENDFOR 62 /// DST[MAX:256] := 0 63 /// \endcode 64 #define _mm256_dpbusds_epi32(S, A, B) \ 65 ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 66 67 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 68 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 69 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 70 /// and store the packed 32-bit results in DST. 71 /// 72 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 73 /// 74 /// \code{.operation} 75 /// FOR j := 0 to 7 76 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 77 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 78 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 79 /// ENDFOR 80 /// DST[MAX:256] := 0 81 /// \endcode 82 #define _mm256_dpwssd_epi32(S, A, B) \ 83 ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 84 85 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 86 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 87 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S 88 /// using signed saturation, and store the packed 32-bit results in DST. 89 /// 90 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 91 /// 92 /// \code{.operation} 93 /// FOR j := 0 to 7 94 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 95 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 96 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 97 /// ENDFOR 98 /// DST[MAX:256] := 0 99 /// \endcode 100 #define _mm256_dpwssds_epi32(S, A, B) \ 101 ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 102 103 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 104 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 105 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 106 /// in \a S, and store the packed 32-bit results in DST. 107 /// 108 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 109 /// 110 /// \code{.operation} 111 /// FOR j := 0 to 3 112 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 113 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 114 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 115 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 116 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 117 /// ENDFOR 118 /// DST[MAX:128] := 0 119 /// \endcode 120 #define _mm_dpbusd_epi32(S, A, B) \ 121 ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 122 123 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 124 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 125 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 126 /// in \a S using signed saturation, and store the packed 32-bit results in DST. 127 /// 128 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 129 /// 130 /// \code{.operation} 131 /// FOR j := 0 to 3 132 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 133 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 134 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 135 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 136 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 137 /// ENDFOR 138 /// DST[MAX:128] := 0 139 /// \endcode 140 #define _mm_dpbusds_epi32(S, A, B) \ 141 ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 142 143 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 144 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 145 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 146 /// and store the packed 32-bit results in DST. 147 /// 148 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 149 /// 150 /// \code{.operation} 151 /// FOR j := 0 to 3 152 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 153 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 154 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 155 /// ENDFOR 156 /// DST[MAX:128] := 0 157 /// \endcode 158 #define _mm_dpwssd_epi32(S, A, B) \ 159 ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 160 161 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 162 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 163 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S 164 /// using signed saturation, and store the packed 32-bit results in DST. 165 /// 166 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 167 /// 168 /// \code{.operation} 169 /// FOR j := 0 to 3 170 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 171 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 172 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 173 /// ENDFOR 174 /// DST[MAX:128] := 0 175 /// \endcode 176 #define _mm_dpwssds_epi32(S, A, B) \ 177 ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 178 179 static __inline__ __m256i __DEFAULT_FN_ATTRS256 180 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 181 { 182 return (__m256i)__builtin_ia32_selectd_256(__U, 183 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 184 (__v8si)__S); 185 } 186 187 static __inline__ __m256i __DEFAULT_FN_ATTRS256 188 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 189 { 190 return (__m256i)__builtin_ia32_selectd_256(__U, 191 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 192 (__v8si)_mm256_setzero_si256()); 193 } 194 195 static __inline__ __m256i __DEFAULT_FN_ATTRS256 196 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 197 { 198 return (__m256i)__builtin_ia32_selectd_256(__U, 199 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 200 (__v8si)__S); 201 } 202 203 static __inline__ __m256i __DEFAULT_FN_ATTRS256 204 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 205 { 206 return (__m256i)__builtin_ia32_selectd_256(__U, 207 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 208 (__v8si)_mm256_setzero_si256()); 209 } 210 211 static __inline__ __m256i __DEFAULT_FN_ATTRS256 212 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 213 { 214 return (__m256i)__builtin_ia32_selectd_256(__U, 215 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 216 (__v8si)__S); 217 } 218 219 static __inline__ __m256i __DEFAULT_FN_ATTRS256 220 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 221 { 222 return (__m256i)__builtin_ia32_selectd_256(__U, 223 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 224 (__v8si)_mm256_setzero_si256()); 225 } 226 227 static __inline__ __m256i __DEFAULT_FN_ATTRS256 228 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 229 { 230 return (__m256i)__builtin_ia32_selectd_256(__U, 231 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 232 (__v8si)__S); 233 } 234 235 static __inline__ __m256i __DEFAULT_FN_ATTRS256 236 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 237 { 238 return (__m256i)__builtin_ia32_selectd_256(__U, 239 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 240 (__v8si)_mm256_setzero_si256()); 241 } 242 243 static __inline__ __m128i __DEFAULT_FN_ATTRS128 244 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 245 { 246 return (__m128i)__builtin_ia32_selectd_128(__U, 247 (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 248 (__v4si)__S); 249 } 250 251 static __inline__ __m128i __DEFAULT_FN_ATTRS128 252 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 253 { 254 return (__m128i)__builtin_ia32_selectd_128(__U, 255 (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 256 (__v4si)_mm_setzero_si128()); 257 } 258 259 static __inline__ __m128i __DEFAULT_FN_ATTRS128 260 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 261 { 262 return (__m128i)__builtin_ia32_selectd_128(__U, 263 (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 264 (__v4si)__S); 265 } 266 267 static __inline__ __m128i __DEFAULT_FN_ATTRS128 268 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 269 { 270 return (__m128i)__builtin_ia32_selectd_128(__U, 271 (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 272 (__v4si)_mm_setzero_si128()); 273 } 274 275 static __inline__ __m128i __DEFAULT_FN_ATTRS128 276 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 277 { 278 return (__m128i)__builtin_ia32_selectd_128(__U, 279 (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 280 (__v4si)__S); 281 } 282 283 static __inline__ __m128i __DEFAULT_FN_ATTRS128 284 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 285 { 286 return (__m128i)__builtin_ia32_selectd_128(__U, 287 (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 288 (__v4si)_mm_setzero_si128()); 289 } 290 291 static __inline__ __m128i __DEFAULT_FN_ATTRS128 292 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 293 { 294 return (__m128i)__builtin_ia32_selectd_128(__U, 295 (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 296 (__v4si)__S); 297 } 298 299 static __inline__ __m128i __DEFAULT_FN_ATTRS128 300 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 301 { 302 return (__m128i)__builtin_ia32_selectd_128(__U, 303 (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 304 (__v4si)_mm_setzero_si128()); 305 } 306 307 #undef __DEFAULT_FN_ATTRS128 308 #undef __DEFAULT_FN_ATTRS256 309 310 #endif 311