1 /*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 #ifndef __IMMINTRIN_H 10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead." 11 #endif 12 13 #ifdef __SSE2__ 14 15 #ifndef __AVX512BF16INTRIN_H 16 #define __AVX512BF16INTRIN_H 17 18 typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64))); 19 typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64))); 20 typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); 21 22 #define __DEFAULT_FN_ATTRS512 \ 23 __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \ 24 __min_vector_width__(512))) 25 #define __DEFAULT_FN_ATTRS \ 26 __attribute__((__always_inline__, __nodebug__, \ 27 __target__("avx512bf16,no-evex512"))) 28 29 /// Convert One BF16 Data to One Single Float Data. 30 /// 31 /// \headerfile <x86intrin.h> 32 /// 33 /// This intrinsic does not correspond to a specific instruction. 34 /// 35 /// \param __A 36 /// A bfloat data. 37 /// \returns A float data whose sign field and exponent field keep unchanged, 38 /// and fraction field is extended to 23 bits. 39 static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) { 40 return __builtin_ia32_cvtsbf162ss_32(__A); 41 } 42 43 /// Convert Two Packed Single Data to One Packed BF16 Data. 44 /// 45 /// \headerfile <x86intrin.h> 46 /// 47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 48 /// 49 /// \param __A 50 /// A 512-bit vector of [16 x float]. 51 /// \param __B 52 /// A 512-bit vector of [16 x float]. 53 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 54 /// conversion of __B, and higher 256 bits come from conversion of __A. 55 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 56 _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { 57 return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A, 58 (__v16sf) __B); 59 } 60 61 /// Convert Two Packed Single Data to One Packed BF16 Data. 62 /// 63 /// \headerfile <x86intrin.h> 64 /// 65 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 66 /// 67 /// \param __A 68 /// A 512-bit vector of [16 x float]. 69 /// \param __B 70 /// A 512-bit vector of [16 x float]. 71 /// \param __W 72 /// A 512-bit vector of [32 x bfloat]. 73 /// \param __U 74 /// A 32-bit mask value specifying what is chosen for each element. 75 /// A 1 means conversion of __A or __B. A 0 means element from __W. 76 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 77 /// conversion of __B, and higher 256 bits come from conversion of __A. 78 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 79 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { 80 return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, 81 (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), 82 (__v32bf)__W); 83 } 84 85 /// Convert Two Packed Single Data to One Packed BF16 Data. 86 /// 87 /// \headerfile <x86intrin.h> 88 /// 89 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 90 /// 91 /// \param __A 92 /// A 512-bit vector of [16 x float]. 93 /// \param __B 94 /// A 512-bit vector of [16 x float]. 95 /// \param __U 96 /// A 32-bit mask value specifying what is chosen for each element. 97 /// A 1 means conversion of __A or __B. A 0 means element is zero. 98 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 99 /// conversion of __B, and higher 256 bits come from conversion of __A. 100 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 101 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { 102 return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, 103 (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), 104 (__v32bf)_mm512_setzero_si512()); 105 } 106 107 /// Convert Packed Single Data to Packed BF16 Data. 108 /// 109 /// \headerfile <x86intrin.h> 110 /// 111 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 112 /// 113 /// \param __A 114 /// A 512-bit vector of [16 x float]. 115 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 116 static __inline__ __m256bh __DEFAULT_FN_ATTRS512 117 _mm512_cvtneps_pbh(__m512 __A) { 118 return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 119 (__v16bf)_mm256_undefined_si256(), 120 (__mmask16)-1); 121 } 122 123 /// Convert Packed Single Data to Packed BF16 Data. 124 /// 125 /// \headerfile <x86intrin.h> 126 /// 127 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 128 /// 129 /// \param __A 130 /// A 512-bit vector of [16 x float]. 131 /// \param __W 132 /// A 256-bit vector of [16 x bfloat]. 133 /// \param __U 134 /// A 16-bit mask value specifying what is chosen for each element. 135 /// A 1 means conversion of __A. A 0 means element from __W. 136 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 137 static __inline__ __m256bh __DEFAULT_FN_ATTRS512 138 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { 139 return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 140 (__v16bf)__W, 141 (__mmask16)__U); 142 } 143 144 /// Convert Packed Single Data to Packed BF16 Data. 145 /// 146 /// \headerfile <x86intrin.h> 147 /// 148 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 149 /// 150 /// \param __A 151 /// A 512-bit vector of [16 x float]. 152 /// \param __U 153 /// A 16-bit mask value specifying what is chosen for each element. 154 /// A 1 means conversion of __A. A 0 means element is zero. 155 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 156 static __inline__ __m256bh __DEFAULT_FN_ATTRS512 157 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { 158 return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 159 (__v16bf)_mm256_setzero_si256(), 160 (__mmask16)__U); 161 } 162 163 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 164 /// 165 /// \headerfile <x86intrin.h> 166 /// 167 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 168 /// 169 /// \param __A 170 /// A 512-bit vector of [32 x bfloat]. 171 /// \param __B 172 /// A 512-bit vector of [32 x bfloat]. 173 /// \param __D 174 /// A 512-bit vector of [16 x float]. 175 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of 176 /// __A, __B and __D 177 static __inline__ __m512 __DEFAULT_FN_ATTRS512 178 _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { 179 return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D, 180 (__v32bf) __A, 181 (__v32bf) __B); 182 } 183 184 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 185 /// 186 /// \headerfile <x86intrin.h> 187 /// 188 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 189 /// 190 /// \param __A 191 /// A 512-bit vector of [32 x bfloat]. 192 /// \param __B 193 /// A 512-bit vector of [32 x bfloat]. 194 /// \param __D 195 /// A 512-bit vector of [16 x float]. 196 /// \param __U 197 /// A 16-bit mask value specifying what is chosen for each element. 198 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. 199 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of 200 /// __A, __B and __D 201 static __inline__ __m512 __DEFAULT_FN_ATTRS512 202 _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { 203 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 204 (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), 205 (__v16sf)__D); 206 } 207 208 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 209 /// 210 /// \headerfile <x86intrin.h> 211 /// 212 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 213 /// 214 /// \param __A 215 /// A 512-bit vector of [32 x bfloat]. 216 /// \param __B 217 /// A 512-bit vector of [32 x bfloat]. 218 /// \param __D 219 /// A 512-bit vector of [16 x float]. 220 /// \param __U 221 /// A 16-bit mask value specifying what is chosen for each element. 222 /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 223 /// \returns A 512-bit vector of [16 x float] comes from Dot Product of 224 /// __A, __B and __D 225 static __inline__ __m512 __DEFAULT_FN_ATTRS512 226 _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { 227 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 228 (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), 229 (__v16sf)_mm512_setzero_si512()); 230 } 231 232 /// Convert Packed BF16 Data to Packed float Data. 233 /// 234 /// \headerfile <x86intrin.h> 235 /// 236 /// \param __A 237 /// A 256-bit vector of [16 x bfloat]. 238 /// \returns A 512-bit vector of [16 x float] come from conversion of __A 239 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { 240 return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( 241 (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); 242 } 243 244 /// Convert Packed BF16 Data to Packed float Data using zeroing mask. 245 /// 246 /// \headerfile <x86intrin.h> 247 /// 248 /// \param __U 249 /// A 16-bit mask. Elements are zeroed out when the corresponding mask 250 /// bit is not set. 251 /// \param __A 252 /// A 256-bit vector of [16 x bfloat]. 253 /// \returns A 512-bit vector of [16 x float] come from conversion of __A 254 static __inline__ __m512 __DEFAULT_FN_ATTRS512 255 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { 256 return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( 257 (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); 258 } 259 260 /// Convert Packed BF16 Data to Packed float Data using merging mask. 261 /// 262 /// \headerfile <x86intrin.h> 263 /// 264 /// \param __S 265 /// A 512-bit vector of [16 x float]. Elements are copied from __S when 266 /// the corresponding mask bit is not set. 267 /// \param __U 268 /// A 16-bit mask. 269 /// \param __A 270 /// A 256-bit vector of [16 x bfloat]. 271 /// \returns A 512-bit vector of [16 x float] come from conversion of __A 272 static __inline__ __m512 __DEFAULT_FN_ATTRS512 273 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { 274 return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( 275 (__m512i)__S, (__mmask16)__U, 276 (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); 277 } 278 279 #undef __DEFAULT_FN_ATTRS 280 #undef __DEFAULT_FN_ATTRS512 281 282 #endif 283 #endif 284