1 /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== 2 * 3 * 4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 * See https://llvm.org/LICENSE.txt for license information. 6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 * 8 *===-----------------------------------------------------------------------=== 9 */ 10 #ifndef __IMMINTRIN_H 11 #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __AVX512VLVNNIINTRIN_H 15 #define __AVX512VLVNNIINTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) 19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256))) 20 21 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 22 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 23 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 24 /// in \a S, and store the packed 32-bit results in DST. 25 /// 26 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 27 /// 28 /// \operation 29 /// FOR j := 0 to 7 30 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 31 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 32 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 33 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 34 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 35 /// ENDFOR 36 /// DST[MAX:256] := 0 37 /// \endoperation 38 #define _mm256_dpbusd_epi32(S, A, B) \ 39 ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 40 41 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 42 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 43 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 44 /// in \a S using signed saturation, and store the packed 32-bit results in DST. 45 /// 46 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 47 /// 48 /// \operation 49 /// FOR j := 0 to 7 50 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 51 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 52 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 53 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 54 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 55 /// ENDFOR 56 /// DST[MAX:256] := 0 57 /// \endoperation 58 #define _mm256_dpbusds_epi32(S, A, B) \ 59 ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 60 61 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 62 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 63 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 64 /// and store the packed 32-bit results in DST. 65 /// 66 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 67 /// 68 /// \operation 69 /// FOR j := 0 to 7 70 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 71 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 72 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 73 /// ENDFOR 74 /// DST[MAX:256] := 0 75 /// \endoperation 76 #define _mm256_dpwssd_epi32(S, A, B) \ 77 ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 78 79 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 80 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 81 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S 82 /// using signed saturation, and store the packed 32-bit results in DST. 83 /// 84 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 85 /// 86 /// \operation 87 /// FOR j := 0 to 7 88 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 89 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 90 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 91 /// ENDFOR 92 /// DST[MAX:256] := 0 93 /// \endoperation 94 #define _mm256_dpwssds_epi32(S, A, B) \ 95 ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) 96 97 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 98 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 99 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 100 /// in \a S, and store the packed 32-bit results in DST. 101 /// 102 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 103 /// 104 /// \operation 105 /// FOR j := 0 to 3 106 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 107 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 108 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 109 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 110 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 111 /// ENDFOR 112 /// DST[MAX:128] := 0 113 /// \endoperation 114 #define _mm_dpbusd_epi32(S, A, B) \ 115 ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 116 117 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with 118 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed 119 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 120 /// in \a S using signed saturation, and store the packed 32-bit results in DST. 121 /// 122 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 123 /// 124 /// \operation 125 /// FOR j := 0 to 3 126 /// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) 127 /// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) 128 /// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) 129 /// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) 130 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 131 /// ENDFOR 132 /// DST[MAX:128] := 0 133 /// \endoperation 134 #define _mm_dpbusds_epi32(S, A, B) \ 135 ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 136 137 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 138 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 139 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S, 140 /// and store the packed 32-bit results in DST. 141 /// 142 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 143 /// 144 /// \operation 145 /// FOR j := 0 to 3 146 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 147 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 148 /// DST.dword[j] := S.dword[j] + tmp1 + tmp2 149 /// ENDFOR 150 /// DST[MAX:128] := 0 151 /// \endoperation 152 #define _mm_dpwssd_epi32(S, A, B) \ 153 ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 154 155 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with 156 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit 157 /// results. Sum these 2 results with the corresponding 32-bit integer in \a S 158 /// using signed saturation, and store the packed 32-bit results in DST. 159 /// 160 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 161 /// 162 /// \operation 163 /// FOR j := 0 to 3 164 /// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) 165 /// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) 166 /// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) 167 /// ENDFOR 168 /// DST[MAX:128] := 0 169 /// \endoperation 170 #define _mm_dpwssds_epi32(S, A, B) \ 171 ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) 172 173 static __inline__ __m256i __DEFAULT_FN_ATTRS256 174 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 175 { 176 return (__m256i)__builtin_ia32_selectd_256(__U, 177 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 178 (__v8si)__S); 179 } 180 181 static __inline__ __m256i __DEFAULT_FN_ATTRS256 182 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 183 { 184 return (__m256i)__builtin_ia32_selectd_256(__U, 185 (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), 186 (__v8si)_mm256_setzero_si256()); 187 } 188 189 static __inline__ __m256i __DEFAULT_FN_ATTRS256 190 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 191 { 192 return (__m256i)__builtin_ia32_selectd_256(__U, 193 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 194 (__v8si)__S); 195 } 196 197 static __inline__ __m256i __DEFAULT_FN_ATTRS256 198 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 199 { 200 return (__m256i)__builtin_ia32_selectd_256(__U, 201 (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), 202 (__v8si)_mm256_setzero_si256()); 203 } 204 205 static __inline__ __m256i __DEFAULT_FN_ATTRS256 206 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 207 { 208 return (__m256i)__builtin_ia32_selectd_256(__U, 209 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 210 (__v8si)__S); 211 } 212 213 static __inline__ __m256i __DEFAULT_FN_ATTRS256 214 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 215 { 216 return (__m256i)__builtin_ia32_selectd_256(__U, 217 (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), 218 (__v8si)_mm256_setzero_si256()); 219 } 220 221 static __inline__ __m256i __DEFAULT_FN_ATTRS256 222 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) 223 { 224 return (__m256i)__builtin_ia32_selectd_256(__U, 225 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 226 (__v8si)__S); 227 } 228 229 static __inline__ __m256i __DEFAULT_FN_ATTRS256 230 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) 231 { 232 return (__m256i)__builtin_ia32_selectd_256(__U, 233 (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), 234 (__v8si)_mm256_setzero_si256()); 235 } 236 237 static __inline__ __m128i __DEFAULT_FN_ATTRS128 238 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 239 { 240 return (__m128i)__builtin_ia32_selectd_128(__U, 241 (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 242 (__v4si)__S); 243 } 244 245 static __inline__ __m128i __DEFAULT_FN_ATTRS128 246 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 247 { 248 return (__m128i)__builtin_ia32_selectd_128(__U, 249 (__v4si)_mm_dpbusd_epi32(__S, __A, __B), 250 (__v4si)_mm_setzero_si128()); 251 } 252 253 static __inline__ __m128i __DEFAULT_FN_ATTRS128 254 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 255 { 256 return (__m128i)__builtin_ia32_selectd_128(__U, 257 (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 258 (__v4si)__S); 259 } 260 261 static __inline__ __m128i __DEFAULT_FN_ATTRS128 262 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 263 { 264 return (__m128i)__builtin_ia32_selectd_128(__U, 265 (__v4si)_mm_dpbusds_epi32(__S, __A, __B), 266 (__v4si)_mm_setzero_si128()); 267 } 268 269 static __inline__ __m128i __DEFAULT_FN_ATTRS128 270 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 271 { 272 return (__m128i)__builtin_ia32_selectd_128(__U, 273 (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 274 (__v4si)__S); 275 } 276 277 static __inline__ __m128i __DEFAULT_FN_ATTRS128 278 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 279 { 280 return (__m128i)__builtin_ia32_selectd_128(__U, 281 (__v4si)_mm_dpwssd_epi32(__S, __A, __B), 282 (__v4si)_mm_setzero_si128()); 283 } 284 285 static __inline__ __m128i __DEFAULT_FN_ATTRS128 286 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) 287 { 288 return (__m128i)__builtin_ia32_selectd_128(__U, 289 (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 290 (__v4si)__S); 291 } 292 293 static __inline__ __m128i __DEFAULT_FN_ATTRS128 294 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) 295 { 296 return (__m128i)__builtin_ia32_selectd_128(__U, 297 (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 298 (__v4si)_mm_setzero_si128()); 299 } 300 301 #undef __DEFAULT_FN_ATTRS128 302 #undef __DEFAULT_FN_ATTRS256 303 304 #endif 305