1 /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error \ 12 "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead." 13 #endif // __IMMINTRIN_H 14 15 #ifdef __SSE2__ 16 17 #ifndef __AVXNECONVERTINTRIN_H 18 #define __AVXNECONVERTINTRIN_H 19 20 /* Define the default attributes for the functions in this file. */ 21 #define __DEFAULT_FN_ATTRS128 \ 22 __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \ 23 __min_vector_width__(128))) 24 #define __DEFAULT_FN_ATTRS256 \ 25 __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \ 26 __min_vector_width__(256))) 27 28 /// Convert scalar BF16 (16-bit) floating-point element 29 /// stored at memory locations starting at location \a __A to a 30 /// single-precision (32-bit) floating-point, broadcast it to packed 31 /// single-precision (32-bit) floating-point elements, and store the results in 32 /// \a dst. 33 /// 34 /// \headerfile <x86intrin.h> 35 /// 36 /// \code 37 /// _mm_bcstnebf16_ps(const void *__A); 38 /// \endcode 39 /// 40 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction. 41 /// 42 /// \param __A 43 /// A pointer to a 16-bit memory location. The address of the memory 44 /// location does not have to be aligned. 45 /// \returns 46 /// A 128-bit vector of [4 x float]. 47 /// 48 /// \code{.operation} 49 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A]) 50 /// FOR j := 0 to 3 51 /// m := j*32 52 /// dst[m+31:m] := b 53 /// ENDFOR 54 /// dst[MAX:128] := 0 55 /// \endcode 56 static __inline__ __m128 __DEFAULT_FN_ATTRS128 57 _mm_bcstnebf16_ps(const void *__A) { 58 return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A); 59 } 60 61 /// Convert scalar BF16 (16-bit) floating-point element 62 /// stored at memory locations starting at location \a __A to a 63 /// single-precision (32-bit) floating-point, broadcast it to packed 64 /// single-precision (32-bit) floating-point elements, and store the results in 65 /// \a dst. 66 /// 67 /// \headerfile <x86intrin.h> 68 /// 69 /// \code 70 /// _mm256_bcstnebf16_ps(const void *__A); 71 /// \endcode 72 /// 73 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction. 74 /// 75 /// \param __A 76 /// A pointer to a 16-bit memory location. The address of the memory 77 /// location does not have to be aligned. 78 /// \returns 79 /// A 256-bit vector of [8 x float]. 80 /// 81 /// \code{.operation} 82 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A]) 83 /// FOR j := 0 to 7 84 /// m := j*32 85 /// dst[m+31:m] := b 86 /// ENDFOR 87 /// dst[MAX:256] := 0 88 /// \endcode 89 static __inline__ __m256 __DEFAULT_FN_ATTRS256 90 _mm256_bcstnebf16_ps(const void *__A) { 91 return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A); 92 } 93 94 /// Convert scalar half-precision (16-bit) floating-point element 95 /// stored at memory locations starting at location \a __A to a 96 /// single-precision (32-bit) floating-point, broadcast it to packed 97 /// single-precision (32-bit) floating-point elements, and store the results in 98 /// \a dst. 99 /// 100 /// \headerfile <x86intrin.h> 101 /// 102 /// \code 103 /// _mm_bcstnesh_ps(const void *__A); 104 /// \endcode 105 /// 106 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction. 107 /// 108 /// \param __A 109 /// A pointer to a 16-bit memory location. The address of the memory 110 /// location does not have to be aligned. 111 /// \returns 112 /// A 128-bit vector of [4 x float]. 113 /// 114 /// \code{.operation} 115 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A]) 116 /// FOR j := 0 to 3 117 /// m := j*32 118 /// dst[m+31:m] := b 119 /// ENDFOR 120 /// dst[MAX:128] := 0 121 /// \endcode 122 static __inline__ __m128 __DEFAULT_FN_ATTRS128 123 _mm_bcstnesh_ps(const void *__A) { 124 return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A); 125 } 126 127 /// Convert scalar half-precision (16-bit) floating-point element 128 /// stored at memory locations starting at location \a __A to a 129 /// single-precision (32-bit) floating-point, broadcast it to packed 130 /// single-precision (32-bit) floating-point elements, and store the results in 131 /// \a dst. 132 /// 133 /// \headerfile <x86intrin.h> 134 /// 135 /// \code 136 /// _mm256_bcstnesh_ps(const void *__A); 137 /// \endcode 138 /// 139 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction. 140 /// 141 /// \param __A 142 /// A pointer to a 16-bit memory location. The address of the memory 143 /// location does not have to be aligned. 144 /// \returns 145 /// A 256-bit vector of [8 x float]. 146 /// 147 /// \code{.operation} 148 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A]) 149 /// FOR j := 0 to 7 150 /// m := j*32 151 /// dst[m+31:m] := b 152 /// ENDFOR 153 /// dst[MAX:256] := 0 154 /// \endcode 155 static __inline__ __m256 __DEFAULT_FN_ATTRS256 156 _mm256_bcstnesh_ps(const void *__A) { 157 return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A); 158 } 159 160 /// Convert packed BF16 (16-bit) floating-point even-indexed elements 161 /// stored at memory locations starting at location \a __A to packed 162 /// single-precision (32-bit) floating-point elements, and store the results in 163 /// \a dst. 164 /// 165 /// \headerfile <x86intrin.h> 166 /// 167 /// \code 168 /// _mm_cvtneebf16_ps(const __m128bh *__A); 169 /// \endcode 170 /// 171 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction. 172 /// 173 /// \param __A 174 /// A pointer to a 128-bit memory location containing 8 consecutive 175 /// BF16 (16-bit) floating-point values. 176 /// \returns 177 /// A 128-bit vector of [4 x float]. 178 /// 179 /// \code{.operation} 180 /// FOR j := 0 to 3 181 /// k := j*2 182 /// i := k*16 183 /// m := j*32 184 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 185 /// ENDFOR 186 /// dst[MAX:128] := 0 187 /// \endcode 188 static __inline__ __m128 __DEFAULT_FN_ATTRS128 189 _mm_cvtneebf16_ps(const __m128bh *__A) { 190 return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A); 191 } 192 193 /// Convert packed BF16 (16-bit) floating-point even-indexed elements 194 /// stored at memory locations starting at location \a __A to packed 195 /// single-precision (32-bit) floating-point elements, and store the results in 196 /// \a dst. 197 /// 198 /// \headerfile <x86intrin.h> 199 /// 200 /// \code 201 /// _mm256_cvtneebf16_ps(const __m256bh *__A); 202 /// \endcode 203 /// 204 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction. 205 /// 206 /// \param __A 207 /// A pointer to a 256-bit memory location containing 16 consecutive 208 /// BF16 (16-bit) floating-point values. 209 /// \returns 210 /// A 256-bit vector of [8 x float]. 211 /// 212 /// \code{.operation} 213 /// FOR j := 0 to 7 214 /// k := j*2 215 /// i := k*16 216 /// m := j*32 217 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 218 /// ENDFOR 219 /// dst[MAX:256] := 0 220 /// \endcode 221 static __inline__ __m256 __DEFAULT_FN_ATTRS256 222 _mm256_cvtneebf16_ps(const __m256bh *__A) { 223 return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A); 224 } 225 226 /// Convert packed half-precision (16-bit) floating-point even-indexed elements 227 /// stored at memory locations starting at location \a __A to packed 228 /// single-precision (32-bit) floating-point elements, and store the results in 229 /// \a dst. 230 /// 231 /// \headerfile <x86intrin.h> 232 /// 233 /// \code 234 /// _mm_cvtneeph_ps(const __m128h *__A); 235 /// \endcode 236 /// 237 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction. 238 /// 239 /// \param __A 240 /// A pointer to a 128-bit memory location containing 8 consecutive 241 /// half-precision (16-bit) floating-point values. 242 /// \returns 243 /// A 128-bit vector of [4 x float]. 244 /// 245 /// \code{.operation} 246 /// FOR j := 0 to 3 247 /// k := j*2 248 /// i := k*16 249 /// m := j*32 250 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 251 /// ENDFOR 252 /// dst[MAX:128] := 0 253 /// \endcode 254 static __inline__ __m128 __DEFAULT_FN_ATTRS128 255 _mm_cvtneeph_ps(const __m128h *__A) { 256 return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A); 257 } 258 259 /// Convert packed half-precision (16-bit) floating-point even-indexed elements 260 /// stored at memory locations starting at location \a __A to packed 261 /// single-precision (32-bit) floating-point elements, and store the results in 262 /// \a dst. 263 /// 264 /// \headerfile <x86intrin.h> 265 /// 266 /// \code 267 /// _mm256_cvtneeph_ps(const __m256h *__A); 268 /// \endcode 269 /// 270 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction. 271 /// 272 /// \param __A 273 /// A pointer to a 256-bit memory location containing 16 consecutive 274 /// half-precision (16-bit) floating-point values. 275 /// \returns 276 /// A 256-bit vector of [8 x float]. 277 /// 278 /// \code{.operation} 279 /// FOR j := 0 to 7 280 /// k := j*2 281 /// i := k*16 282 /// m := j*32 283 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 284 /// ENDFOR 285 /// dst[MAX:256] := 0 286 /// \endcode 287 static __inline__ __m256 __DEFAULT_FN_ATTRS256 288 _mm256_cvtneeph_ps(const __m256h *__A) { 289 return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A); 290 } 291 292 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements 293 /// stored at memory locations starting at location \a __A to packed 294 /// single-precision (32-bit) floating-point elements, and store the results in 295 /// \a dst. 296 /// 297 /// \headerfile <x86intrin.h> 298 /// 299 /// \code 300 /// _mm_cvtneobf16_ps(const __m128bh *__A); 301 /// \endcode 302 /// 303 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction. 304 /// 305 /// \param __A 306 /// A pointer to a 128-bit memory location containing 8 consecutive 307 /// BF16 (16-bit) floating-point values. 308 /// \returns 309 /// A 128-bit vector of [4 x float]. 310 /// 311 /// \code{.operation} 312 /// FOR j := 0 to 3 313 /// k := j*2+1 314 /// i := k*16 315 /// m := j*32 316 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 317 /// ENDFOR 318 /// dst[MAX:128] := 0 319 /// \endcode 320 static __inline__ __m128 __DEFAULT_FN_ATTRS128 321 _mm_cvtneobf16_ps(const __m128bh *__A) { 322 return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A); 323 } 324 325 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements 326 /// stored at memory locations starting at location \a __A to packed 327 /// single-precision (32-bit) floating-point elements, and store the results in 328 /// \a dst. 329 /// 330 /// \headerfile <x86intrin.h> 331 /// 332 /// \code 333 /// _mm256_cvtneobf16_ps(const __m256bh *__A); 334 /// \endcode 335 /// 336 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction. 337 /// 338 /// \param __A 339 /// A pointer to a 256-bit memory location containing 16 consecutive 340 /// BF16 (16-bit) floating-point values. 341 /// \returns 342 /// A 256-bit vector of [8 x float]. 343 /// 344 /// \code{.operation} 345 /// FOR j := 0 to 7 346 /// k := j*2+1 347 /// i := k*16 348 /// m := j*32 349 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) 350 /// ENDFOR 351 /// dst[MAX:256] := 0 352 /// \endcode 353 static __inline__ __m256 __DEFAULT_FN_ATTRS256 354 _mm256_cvtneobf16_ps(const __m256bh *__A) { 355 return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A); 356 } 357 358 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements 359 /// stored at memory locations starting at location \a __A to packed 360 /// single-precision (32-bit) floating-point elements, and store the results in 361 /// \a dst. 362 /// 363 /// \headerfile <x86intrin.h> 364 /// 365 /// \code 366 /// _mm_cvtneoph_ps(const __m128h *__A); 367 /// \endcode 368 /// 369 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction. 370 /// 371 /// \param __A 372 /// A pointer to a 128-bit memory location containing 8 consecutive 373 /// half-precision (16-bit) floating-point values. 374 /// \returns 375 /// A 128-bit vector of [4 x float]. 376 /// 377 /// \code{.operation} 378 /// FOR j := 0 to 3 379 /// k := j*2+1 380 /// i := k*16 381 /// m := j*32 382 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 383 /// ENDFOR 384 /// dst[MAX:128] := 0 385 /// \endcode 386 static __inline__ __m128 __DEFAULT_FN_ATTRS128 387 _mm_cvtneoph_ps(const __m128h *__A) { 388 return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A); 389 } 390 391 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements 392 /// stored at memory locations starting at location \a __A to packed 393 /// single-precision (32-bit) floating-point elements, and store the results in 394 /// \a dst. 395 /// 396 /// \headerfile <x86intrin.h> 397 /// 398 /// \code 399 /// _mm256_cvtneoph_ps(const __m256h *__A); 400 /// \endcode 401 /// 402 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction. 403 /// 404 /// \param __A 405 /// A pointer to a 256-bit memory location containing 16 consecutive 406 /// half-precision (16-bit) floating-point values. 407 /// \returns 408 /// A 256-bit vector of [8 x float]. 409 /// 410 /// \code{.operation} 411 /// FOR j := 0 to 7 412 /// k := j*2+1 413 /// i := k*16 414 /// m := j*32 415 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) 416 /// ENDFOR 417 /// dst[MAX:256] := 0 418 /// \endcode 419 static __inline__ __m256 __DEFAULT_FN_ATTRS256 420 _mm256_cvtneoph_ps(const __m256h *__A) { 421 return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A); 422 } 423 424 /// Convert packed single-precision (32-bit) floating-point elements in \a __A 425 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a 426 /// dst. 427 /// 428 /// \headerfile <x86intrin.h> 429 /// 430 /// \code 431 /// _mm_cvtneps_avx_pbh(__m128 __A); 432 /// \endcode 433 /// 434 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction. 435 /// 436 /// \param __A 437 /// A 128-bit vector of [4 x float]. 438 /// \returns 439 /// A 128-bit vector of [8 x bfloat]. 440 /// 441 /// \code{.operation} 442 /// FOR j := 0 to 3 443 /// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) 444 /// ENDFOR 445 /// dst[MAX:128] := 0 446 /// \endcode 447 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 448 _mm_cvtneps_avx_pbh(__m128 __A) { 449 return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A); 450 } 451 452 /// Convert packed single-precision (32-bit) floating-point elements in \a __A 453 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a 454 /// dst. 455 /// 456 /// \headerfile <x86intrin.h> 457 /// 458 /// \code 459 /// _mm256_cvtneps_avx_pbh(__m256 __A); 460 /// \endcode 461 /// 462 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction. 463 /// 464 /// \param __A 465 /// A 256-bit vector of [8 x float]. 466 /// \returns 467 /// A 128-bit vector of [8 x bfloat]. 468 /// 469 /// \code{.operation} 470 /// FOR j := 0 to 7 471 /// dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) 472 /// ENDFOR 473 /// dst[MAX:128] := 0 474 /// \endcode 475 static __inline__ __m128bh __DEFAULT_FN_ATTRS256 476 _mm256_cvtneps_avx_pbh(__m256 __A) { 477 return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A); 478 } 479 480 #undef __DEFAULT_FN_ATTRS128 481 #undef __DEFAULT_FN_ATTRS256 482 483 #endif // __AVXNECONVERTINTRIN_H 484 #endif // __SSE2__ 485