/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
26 #endif 27 28 #ifndef __AVXVNNIINTRIN_H 29 #define __AVXVNNIINTRIN_H 30 31 /* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */ 32 /// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) 33 /// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) 34 /// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) 35 /// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) 36 /// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) 37 /// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) 38 /// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) 39 /// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) 40 41 /* Intrinsics with _avx_ prefix are for compatibility with msvc. */ 42 /* Define the default attributes for the functions in this file. */ 43 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256))) 44 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128))) 45 46 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with 47 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed 48 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 49 /// in \a __S, and store the packed 32-bit results in DST. 50 /// 51 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 
52 /// 53 /// \code{.operation} 54 /// FOR j := 0 to 7 55 /// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) 56 /// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) 57 /// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) 58 /// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) 59 /// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 60 /// ENDFOR 61 /// DST[MAX:256] := 0 62 /// \endcode 63 static __inline__ __m256i __DEFAULT_FN_ATTRS256 64 _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) 65 { 66 return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B); 67 } 68 69 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with 70 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed 71 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 72 /// in \a __S using signed saturation, and store the packed 32-bit results in DST. 73 /// 74 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 
75 /// 76 /// \code{.operation} 77 /// FOR j := 0 to 7 78 /// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) 79 /// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) 80 /// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) 81 /// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) 82 /// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 83 /// ENDFOR 84 /// DST[MAX:256] := 0 85 /// \endcode 86 static __inline__ __m256i __DEFAULT_FN_ATTRS256 87 _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) 88 { 89 return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B); 90 } 91 92 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 93 /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit 94 /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, 95 /// and store the packed 32-bit results in DST. 96 /// 97 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 98 /// 99 /// \code{.operation} 100 /// FOR j := 0 to 7 101 /// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 102 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 103 /// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 104 /// ENDFOR 105 /// DST[MAX:256] := 0 106 /// \endcode 107 static __inline__ __m256i __DEFAULT_FN_ATTRS256 108 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) 109 { 110 return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); 111 } 112 113 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 114 /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit 115 /// results. 
Sum these 2 results with the corresponding 32-bit integer in \a __S 116 /// using signed saturation, and store the packed 32-bit results in DST. 117 /// 118 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 119 /// 120 /// \code{.operation} 121 /// FOR j := 0 to 7 122 /// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 123 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 124 /// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) 125 /// ENDFOR 126 /// DST[MAX:256] := 0 127 /// \endcode 128 static __inline__ __m256i __DEFAULT_FN_ATTRS256 129 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) 130 { 131 return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); 132 } 133 134 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with 135 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed 136 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 137 /// in \a __S, and store the packed 32-bit results in DST. 138 /// 139 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions. 
140 /// 141 /// \code{.operation} 142 /// FOR j := 0 to 3 143 /// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) 144 /// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) 145 /// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) 146 /// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) 147 /// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 148 /// ENDFOR 149 /// DST[MAX:128] := 0 150 /// \endcode 151 static __inline__ __m128i __DEFAULT_FN_ATTRS128 152 _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) 153 { 154 return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B); 155 } 156 157 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with 158 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed 159 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer 160 /// in \a __S using signed saturation, and store the packed 32-bit results in DST. 161 /// 162 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions. 
163 /// 164 /// \code{.operation} 165 /// FOR j := 0 to 3 166 /// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) 167 /// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) 168 /// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) 169 /// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) 170 /// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) 171 /// ENDFOR 172 /// DST[MAX:128] := 0 173 /// \endcode 174 static __inline__ __m128i __DEFAULT_FN_ATTRS128 175 _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) 176 { 177 return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B); 178 } 179 180 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 181 /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit 182 /// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, 183 /// and store the packed 32-bit results in DST. 184 /// 185 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions. 186 /// 187 /// \code{.operation} 188 /// FOR j := 0 to 3 189 /// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 190 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 191 /// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 192 /// ENDFOR 193 /// DST[MAX:128] := 0 194 /// \endcode 195 static __inline__ __m128i __DEFAULT_FN_ATTRS128 196 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) 197 { 198 return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); 199 } 200 201 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with 202 /// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit 203 /// results. 
Sum these 2 results with the corresponding 32-bit integer in \a __S 204 /// using signed saturation, and store the packed 32-bit results in DST. 205 /// 206 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions. 207 /// 208 /// \code{.operation} 209 /// FOR j := 0 to 3 210 /// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) 211 /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) 212 /// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) 213 /// ENDFOR 214 /// DST[MAX:128] := 0 215 /// \endcode 216 static __inline__ __m128i __DEFAULT_FN_ATTRS128 217 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) 218 { 219 return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); 220 } 221 222 #undef __DEFAULT_FN_ATTRS128 223 #undef __DEFAULT_FN_ATTRS256 224 225 #endif // __AVXVNNIINTRIN_H 226