10b57cec5SDimitry Andric /*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------=== 20b57cec5SDimitry Andric * 30b57cec5SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric * 70b57cec5SDimitry Andric *===-----------------------------------------------------------------------=== 80b57cec5SDimitry Andric */ 90b57cec5SDimitry Andric #ifndef __IMMINTRIN_H 100b57cec5SDimitry Andric #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead." 110b57cec5SDimitry Andric #endif 120b57cec5SDimitry Andric 13bdd1243dSDimitry Andric #ifdef __SSE2__ 14bdd1243dSDimitry Andric 150b57cec5SDimitry Andric #ifndef __AVX512BF16INTRIN_H 160b57cec5SDimitry Andric #define __AVX512BF16INTRIN_H 170b57cec5SDimitry Andric 18bdd1243dSDimitry Andric typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64))); 19bdd1243dSDimitry Andric typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64))); 20bdd1243dSDimitry Andric typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); 210b57cec5SDimitry Andric 220b57cec5SDimitry Andric #define __DEFAULT_FN_ATTRS512 \ 23*5f757f3fSDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \ 240b57cec5SDimitry Andric __min_vector_width__(512))) 250b57cec5SDimitry Andric #define __DEFAULT_FN_ATTRS \ 26*5f757f3fSDimitry Andric __attribute__((__always_inline__, __nodebug__, \ 27*5f757f3fSDimitry Andric __target__("avx512bf16,no-evex512"))) 280b57cec5SDimitry Andric 290b57cec5SDimitry Andric /// Convert One BF16 Data to One Single Float Data. 300b57cec5SDimitry Andric /// 310b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 320b57cec5SDimitry Andric /// 330b57cec5SDimitry Andric /// This intrinsic does not correspond to a specific instruction. 340b57cec5SDimitry Andric /// 350b57cec5SDimitry Andric /// \param __A 360b57cec5SDimitry Andric /// A bfloat data. 370b57cec5SDimitry Andric /// \returns A float data whose sign field and exponent field keep unchanged, 380b57cec5SDimitry Andric /// and fraction field is extended to 23 bits. 39bdd1243dSDimitry Andric static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) { 400b57cec5SDimitry Andric return __builtin_ia32_cvtsbf162ss_32(__A); 410b57cec5SDimitry Andric } 420b57cec5SDimitry Andric 430b57cec5SDimitry Andric /// Convert Two Packed Single Data to One Packed BF16 Data. 440b57cec5SDimitry Andric /// 450b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 460b57cec5SDimitry Andric /// 470b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 480b57cec5SDimitry Andric /// 490b57cec5SDimitry Andric /// \param __A 500b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 510b57cec5SDimitry Andric /// \param __B 520b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 530b57cec5SDimitry Andric /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 540b57cec5SDimitry Andric /// conversion of __B, and higher 256 bits come from conversion of __A. 550b57cec5SDimitry Andric static __inline__ __m512bh __DEFAULT_FN_ATTRS512 560b57cec5SDimitry Andric _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { 570b57cec5SDimitry Andric return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A, 580b57cec5SDimitry Andric (__v16sf) __B); 590b57cec5SDimitry Andric } 600b57cec5SDimitry Andric 610b57cec5SDimitry Andric /// Convert Two Packed Single Data to One Packed BF16 Data. 620b57cec5SDimitry Andric /// 630b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 640b57cec5SDimitry Andric /// 650b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 660b57cec5SDimitry Andric /// 670b57cec5SDimitry Andric /// \param __A 680b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 690b57cec5SDimitry Andric /// \param __B 700b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 710b57cec5SDimitry Andric /// \param __W 720b57cec5SDimitry Andric /// A 512-bit vector of [32 x bfloat]. 730b57cec5SDimitry Andric /// \param __U 740b57cec5SDimitry Andric /// A 32-bit mask value specifying what is chosen for each element. 750b57cec5SDimitry Andric /// A 1 means conversion of __A or __B. A 0 means element from __W. 760b57cec5SDimitry Andric /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 770b57cec5SDimitry Andric /// conversion of __B, and higher 256 bits come from conversion of __A. 780b57cec5SDimitry Andric static __inline__ __m512bh __DEFAULT_FN_ATTRS512 790b57cec5SDimitry Andric _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { 80bdd1243dSDimitry Andric return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, 81bdd1243dSDimitry Andric (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), 82bdd1243dSDimitry Andric (__v32bf)__W); 830b57cec5SDimitry Andric } 840b57cec5SDimitry Andric 850b57cec5SDimitry Andric /// Convert Two Packed Single Data to One Packed BF16 Data. 860b57cec5SDimitry Andric /// 870b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 880b57cec5SDimitry Andric /// 890b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions. 900b57cec5SDimitry Andric /// 910b57cec5SDimitry Andric /// \param __A 920b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 930b57cec5SDimitry Andric /// \param __B 940b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 950b57cec5SDimitry Andric /// \param __U 960b57cec5SDimitry Andric /// A 32-bit mask value specifying what is chosen for each element. 970b57cec5SDimitry Andric /// A 1 means conversion of __A or __B. A 0 means element is zero. 980b57cec5SDimitry Andric /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from 990b57cec5SDimitry Andric /// conversion of __B, and higher 256 bits come from conversion of __A. 1000b57cec5SDimitry Andric static __inline__ __m512bh __DEFAULT_FN_ATTRS512 1010b57cec5SDimitry Andric _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { 102bdd1243dSDimitry Andric return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, 103bdd1243dSDimitry Andric (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), 104bdd1243dSDimitry Andric (__v32bf)_mm512_setzero_si512()); 1050b57cec5SDimitry Andric } 1060b57cec5SDimitry Andric 1070b57cec5SDimitry Andric /// Convert Packed Single Data to Packed BF16 Data. 1080b57cec5SDimitry Andric /// 1090b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 1100b57cec5SDimitry Andric /// 1110b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 1120b57cec5SDimitry Andric /// 1130b57cec5SDimitry Andric /// \param __A 1140b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 1150b57cec5SDimitry Andric /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 1160b57cec5SDimitry Andric static __inline__ __m256bh __DEFAULT_FN_ATTRS512 1170b57cec5SDimitry Andric _mm512_cvtneps_pbh(__m512 __A) { 1180b57cec5SDimitry Andric return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 119bdd1243dSDimitry Andric (__v16bf)_mm256_undefined_si256(), 1200b57cec5SDimitry Andric (__mmask16)-1); 1210b57cec5SDimitry Andric } 1220b57cec5SDimitry Andric 1230b57cec5SDimitry Andric /// Convert Packed Single Data to Packed BF16 Data. 1240b57cec5SDimitry Andric /// 1250b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 1260b57cec5SDimitry Andric /// 1270b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 1280b57cec5SDimitry Andric /// 1290b57cec5SDimitry Andric /// \param __A 1300b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 1310b57cec5SDimitry Andric /// \param __W 1320b57cec5SDimitry Andric /// A 256-bit vector of [16 x bfloat]. 1330b57cec5SDimitry Andric /// \param __U 1340b57cec5SDimitry Andric /// A 16-bit mask value specifying what is chosen for each element. 1350b57cec5SDimitry Andric /// A 1 means conversion of __A. A 0 means element from __W. 1360b57cec5SDimitry Andric /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 1370b57cec5SDimitry Andric static __inline__ __m256bh __DEFAULT_FN_ATTRS512 1380b57cec5SDimitry Andric _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { 1390b57cec5SDimitry Andric return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 140bdd1243dSDimitry Andric (__v16bf)__W, 1410b57cec5SDimitry Andric (__mmask16)__U); 1420b57cec5SDimitry Andric } 1430b57cec5SDimitry Andric 1440b57cec5SDimitry Andric /// Convert Packed Single Data to Packed BF16 Data. 1450b57cec5SDimitry Andric /// 1460b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 1470b57cec5SDimitry Andric /// 1480b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions. 1490b57cec5SDimitry Andric /// 1500b57cec5SDimitry Andric /// \param __A 1510b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 1520b57cec5SDimitry Andric /// \param __U 1530b57cec5SDimitry Andric /// A 16-bit mask value specifying what is chosen for each element. 1540b57cec5SDimitry Andric /// A 1 means conversion of __A. A 0 means element is zero. 1550b57cec5SDimitry Andric /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. 1560b57cec5SDimitry Andric static __inline__ __m256bh __DEFAULT_FN_ATTRS512 1570b57cec5SDimitry Andric _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { 1580b57cec5SDimitry Andric return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, 159bdd1243dSDimitry Andric (__v16bf)_mm256_setzero_si256(), 1600b57cec5SDimitry Andric (__mmask16)__U); 1610b57cec5SDimitry Andric } 1620b57cec5SDimitry Andric 1630b57cec5SDimitry Andric /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 1640b57cec5SDimitry Andric /// 1650b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 1660b57cec5SDimitry Andric /// 1670b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 1680b57cec5SDimitry Andric /// 1690b57cec5SDimitry Andric /// \param __A 1700b57cec5SDimitry Andric /// A 512-bit vector of [32 x bfloat]. 1710b57cec5SDimitry Andric /// \param __B 1720b57cec5SDimitry Andric /// A 512-bit vector of [32 x bfloat]. 1730b57cec5SDimitry Andric /// \param __D 1740b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 1750b57cec5SDimitry Andric /// \returns A 512-bit vector of [16 x float] comes from Dot Product of 1760b57cec5SDimitry Andric /// __A, __B and __D 1770b57cec5SDimitry Andric static __inline__ __m512 __DEFAULT_FN_ATTRS512 1780b57cec5SDimitry Andric _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { 1790b57cec5SDimitry Andric return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D, 180bdd1243dSDimitry Andric (__v32bf) __A, 181bdd1243dSDimitry Andric (__v32bf) __B); 1820b57cec5SDimitry Andric } 1830b57cec5SDimitry Andric 1840b57cec5SDimitry Andric /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 1850b57cec5SDimitry Andric /// 1860b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 1870b57cec5SDimitry Andric /// 1880b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 1890b57cec5SDimitry Andric /// 1900b57cec5SDimitry Andric /// \param __A 1910b57cec5SDimitry Andric /// A 512-bit vector of [32 x bfloat]. 1920b57cec5SDimitry Andric /// \param __B 1930b57cec5SDimitry Andric /// A 512-bit vector of [32 x bfloat]. 1940b57cec5SDimitry Andric /// \param __D 1950b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 1960b57cec5SDimitry Andric /// \param __U 1970b57cec5SDimitry Andric /// A 16-bit mask value specifying what is chosen for each element. 1980b57cec5SDimitry Andric /// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. 1990b57cec5SDimitry Andric /// \returns A 512-bit vector of [16 x float] comes from Dot Product of 2000b57cec5SDimitry Andric /// __A, __B and __D 2010b57cec5SDimitry Andric static __inline__ __m512 __DEFAULT_FN_ATTRS512 2020b57cec5SDimitry Andric _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { 2030b57cec5SDimitry Andric return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2040b57cec5SDimitry Andric (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), 2050b57cec5SDimitry Andric (__v16sf)__D); 2060b57cec5SDimitry Andric } 2070b57cec5SDimitry Andric 2080b57cec5SDimitry Andric /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 2090b57cec5SDimitry Andric /// 2100b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 2110b57cec5SDimitry Andric /// 2120b57cec5SDimitry Andric /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions. 2130b57cec5SDimitry Andric /// 2140b57cec5SDimitry Andric /// \param __A 2150b57cec5SDimitry Andric /// A 512-bit vector of [32 x bfloat]. 2160b57cec5SDimitry Andric /// \param __B 2170b57cec5SDimitry Andric /// A 512-bit vector of [32 x bfloat]. 2180b57cec5SDimitry Andric /// \param __D 2190b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. 2200b57cec5SDimitry Andric /// \param __U 2210b57cec5SDimitry Andric /// A 16-bit mask value specifying what is chosen for each element. 2220b57cec5SDimitry Andric /// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 2230b57cec5SDimitry Andric /// \returns A 512-bit vector of [16 x float] comes from Dot Product of 2240b57cec5SDimitry Andric /// __A, __B and __D 2250b57cec5SDimitry Andric static __inline__ __m512 __DEFAULT_FN_ATTRS512 2260b57cec5SDimitry Andric _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { 2270b57cec5SDimitry Andric return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 2280b57cec5SDimitry Andric (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), 2290b57cec5SDimitry Andric (__v16sf)_mm512_setzero_si512()); 2300b57cec5SDimitry Andric } 2310b57cec5SDimitry Andric 2320b57cec5SDimitry Andric /// Convert Packed BF16 Data to Packed float Data. 2330b57cec5SDimitry Andric /// 2340b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 2350b57cec5SDimitry Andric /// 2360b57cec5SDimitry Andric /// \param __A 2370b57cec5SDimitry Andric /// A 256-bit vector of [16 x bfloat]. 238349cc55cSDimitry Andric /// \returns A 512-bit vector of [16 x float] come from conversion of __A 2390b57cec5SDimitry Andric static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { 2400b57cec5SDimitry Andric return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( 2410b57cec5SDimitry Andric (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); 2420b57cec5SDimitry Andric } 2430b57cec5SDimitry Andric 2440b57cec5SDimitry Andric /// Convert Packed BF16 Data to Packed float Data using zeroing mask. 2450b57cec5SDimitry Andric /// 2460b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 2470b57cec5SDimitry Andric /// 2480b57cec5SDimitry Andric /// \param __U 2490b57cec5SDimitry Andric /// A 16-bit mask. Elements are zeroed out when the corresponding mask 2500b57cec5SDimitry Andric /// bit is not set. 2510b57cec5SDimitry Andric /// \param __A 2520b57cec5SDimitry Andric /// A 256-bit vector of [16 x bfloat]. 253349cc55cSDimitry Andric /// \returns A 512-bit vector of [16 x float] come from conversion of __A 2540b57cec5SDimitry Andric static __inline__ __m512 __DEFAULT_FN_ATTRS512 2550b57cec5SDimitry Andric _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { 2560b57cec5SDimitry Andric return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( 2570b57cec5SDimitry Andric (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); 2580b57cec5SDimitry Andric } 2590b57cec5SDimitry Andric 2600b57cec5SDimitry Andric /// Convert Packed BF16 Data to Packed float Data using merging mask. 2610b57cec5SDimitry Andric /// 2620b57cec5SDimitry Andric /// \headerfile <x86intrin.h> 2630b57cec5SDimitry Andric /// 2640b57cec5SDimitry Andric /// \param __S 2650b57cec5SDimitry Andric /// A 512-bit vector of [16 x float]. Elements are copied from __S when 2660b57cec5SDimitry Andric /// the corresponding mask bit is not set. 2670b57cec5SDimitry Andric /// \param __U 2680b57cec5SDimitry Andric /// A 16-bit mask. 2690b57cec5SDimitry Andric /// \param __A 2700b57cec5SDimitry Andric /// A 256-bit vector of [16 x bfloat]. 271349cc55cSDimitry Andric /// \returns A 512-bit vector of [16 x float] come from conversion of __A 2720b57cec5SDimitry Andric static __inline__ __m512 __DEFAULT_FN_ATTRS512 2730b57cec5SDimitry Andric _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { 2740b57cec5SDimitry Andric return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( 2750b57cec5SDimitry Andric (__m512i)__S, (__mmask16)__U, 2760b57cec5SDimitry Andric (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); 2770b57cec5SDimitry Andric } 2780b57cec5SDimitry Andric 2790b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS 2800b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS512 2810b57cec5SDimitry Andric 2820b57cec5SDimitry Andric #endif 283bdd1243dSDimitry Andric #endif 284