xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx512vlbf16intrin.h (revision a03411e84728e9b267056fd31c7d1d9d1dc1b01e)
1 /*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX512VLBF16INTRIN_H
16 #define __AVX512VLBF16INTRIN_H
17 
18 #define __DEFAULT_FN_ATTRS128 \
19   __attribute__((__always_inline__, __nodebug__, \
20                  __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
21 #define __DEFAULT_FN_ATTRS256 \
22   __attribute__((__always_inline__, __nodebug__, \
23                  __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
24 
25 /// Convert Two Packed Single Data to One Packed BF16 Data.
26 ///
27 /// \headerfile <x86intrin.h>
28 ///
29 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
30 ///
31 /// \param __A
32 ///    A 128-bit vector of [4 x float].
33 /// \param __B
34 ///    A 128-bit vector of [4 x float].
35 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
36 ///    conversion of __B, and higher 64 bits come from conversion of __A.
37 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
38 _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
39   return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
40                                                     (__v4sf) __B);
41 }
42 
43 /// Convert Two Packed Single Data to One Packed BF16 Data.
44 ///
45 /// \headerfile <x86intrin.h>
46 ///
47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48 ///
49 /// \param __A
50 ///    A 128-bit vector of [4 x float].
51 /// \param __B
52 ///    A 128-bit vector of [4 x float].
53 /// \param __W
54 ///    A 128-bit vector of [8 x bfloat].
55 /// \param __U
56 ///    A 8-bit mask value specifying what is chosen for each element.
57 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
58 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
59 ///    conversion of __B, and higher 64 bits come from conversion of __A.
60 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
61 _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
62   return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
63                                              (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
64                                              (__v8bf)__W);
65 }
66 
67 /// Convert Two Packed Single Data to One Packed BF16 Data.
68 ///
69 /// \headerfile <x86intrin.h>
70 ///
71 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
72 ///
73 /// \param __A
74 ///    A 128-bit vector of [4 x float].
75 /// \param __B
76 ///    A 128-bit vector of [4 x float].
77 /// \param __U
78 ///    A 8-bit mask value specifying what is chosen for each element.
79 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
80 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
81 ///    conversion of __B, and higher 64 bits come from conversion of __A.
82 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
83 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
84   return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
85                                              (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
86                                              (__v8bf)_mm_setzero_si128());
87 }
88 
89 /// Convert Two Packed Single Data to One Packed BF16 Data.
90 ///
91 /// \headerfile <x86intrin.h>
92 ///
93 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
94 ///
95 /// \param __A
96 ///    A 256-bit vector of [8 x float].
97 /// \param __B
98 ///    A 256-bit vector of [8 x float].
99 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
100 ///    conversion of __B, and higher 128 bits come from conversion of __A.
101 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
102 _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
103   return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
104                                                     (__v8sf) __B);
105 }
106 
107 /// Convert Two Packed Single Data to One Packed BF16 Data.
108 ///
109 /// \headerfile <x86intrin.h>
110 ///
111 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
112 ///
113 /// \param __A
114 ///    A 256-bit vector of [8 x float].
115 /// \param __B
116 ///    A 256-bit vector of [8 x float].
117 /// \param __W
118 ///    A 256-bit vector of [16 x bfloat].
119 /// \param __U
120 ///    A 16-bit mask value specifying what is chosen for each element.
121 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
122 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
123 ///    conversion of __B, and higher 128 bits come from conversion of __A.
124 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
125 _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
126   return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
127                                          (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
128                                          (__v16bf)__W);
129 }
130 
131 /// Convert Two Packed Single Data to One Packed BF16 Data.
132 ///
133 /// \headerfile <x86intrin.h>
134 ///
135 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
136 ///
137 /// \param __A
138 ///    A 256-bit vector of [8 x float].
139 /// \param __B
140 ///    A 256-bit vector of [8 x float].
141 /// \param __U
142 ///    A 16-bit mask value specifying what is chosen for each element.
143 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
144 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
145 ///    conversion of __B, and higher 128 bits come from conversion of __A.
146 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
147 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
148   return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
149                                          (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
150                                          (__v16bf)_mm256_setzero_si256());
151 }
152 
153 /// Convert Packed Single Data to Packed BF16 Data.
154 ///
155 /// \headerfile <x86intrin.h>
156 ///
157 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
158 ///
159 /// \param __A
160 ///    A 128-bit vector of [4 x float].
161 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
162 ///    conversion of __A, and higher 64 bits are 0.
163 #define _mm_cvtneps_pbh(A)                                                     \
164   ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
165 
166 /// Convert Packed Single Data to Packed BF16 Data.
167 ///
168 /// \headerfile <x86intrin.h>
169 ///
170 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
171 ///
172 /// \param __A
173 ///    A 128-bit vector of [4 x float].
174 /// \param __W
175 ///    A 128-bit vector of [8 x bfloat].
176 /// \param __U
177 ///    A 4-bit mask value specifying what is chosen for each element.
178 ///    A 1 means conversion of __A. A 0 means element from __W.
179 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
180 ///    conversion of __A, and higher 64 bits are 0.
181 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
182 _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
183   return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
184                                                         (__v8bf)__W,
185                                                         (__mmask8)__U);
186 }
187 
188 /// Convert Packed Single Data to Packed BF16 Data.
189 ///
190 /// \headerfile <x86intrin.h>
191 ///
192 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
193 ///
194 /// \param __A
195 ///    A 128-bit vector of [4 x float].
196 /// \param __U
197 ///    A 4-bit mask value specifying what is chosen for each element.
198 ///    A 1 means conversion of __A. A 0 means element is zero.
199 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
200 ///    conversion of __A, and higher 64 bits are 0.
201 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
202 _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
203   return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
204                                                     (__v8bf)_mm_setzero_si128(),
205                                                     (__mmask8)__U);
206 }
207 
208 /// Convert Packed Single Data to Packed BF16 Data.
209 ///
210 /// \headerfile <x86intrin.h>
211 ///
212 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
213 ///
214 /// \param __A
215 ///    A 256-bit vector of [8 x float].
216 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
217 #define _mm256_cvtneps_pbh(A)                                                  \
218   ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
219 
220 /// Convert Packed Single Data to Packed BF16 Data.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
225 ///
226 /// \param __A
227 ///    A 256-bit vector of [8 x float].
228 /// \param __W
229 ///    A 256-bit vector of [8 x bfloat].
230 /// \param __U
231 ///    A 8-bit mask value specifying what is chosen for each element.
232 ///    A 1 means conversion of __A. A 0 means element from __W.
233 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
234 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
235 _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
236   return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
237                                                         (__v8bf)__W,
238                                                         (__mmask8)__U);
239 }
240 
241 /// Convert Packed Single Data to Packed BF16 Data.
242 ///
243 /// \headerfile <x86intrin.h>
244 ///
245 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
246 ///
247 /// \param __A
248 ///    A 256-bit vector of [8 x float].
249 /// \param __U
250 ///    A 8-bit mask value specifying what is chosen for each element.
251 ///    A 1 means conversion of __A. A 0 means element is zero.
252 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
253 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
254 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
255   return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
256                                                     (__v8bf)_mm_setzero_si128(),
257                                                     (__mmask8)__U);
258 }
259 
260 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
265 ///
266 /// \param __A
267 ///    A 128-bit vector of [8 x bfloat].
268 /// \param __B
269 ///    A 128-bit vector of [8 x bfloat].
270 /// \param __D
271 ///    A 128-bit vector of [4 x float].
272 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
273 ///  __A, __B and __D
274 static __inline__ __m128 __DEFAULT_FN_ATTRS128
275 _mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
276   return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
277                                              (__v8bf)__A,
278                                              (__v8bf)__B);
279 }
280 
281 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
282 ///
283 /// \headerfile <x86intrin.h>
284 ///
285 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
286 ///
287 /// \param __A
288 ///    A 128-bit vector of [8 x bfloat].
289 /// \param __B
290 ///    A 128-bit vector of [8 x bfloat].
291 /// \param __D
292 ///    A 128-bit vector of [4 x float].
293 /// \param __U
294 ///    A 8-bit mask value specifying what is chosen for each element.
295 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
296 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
297 ///  __A, __B and __D
298 static __inline__ __m128 __DEFAULT_FN_ATTRS128
299 _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
300   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
301                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
302                                            (__v4sf)__D);
303 }
304 
305 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
306 ///
307 /// \headerfile <x86intrin.h>
308 ///
309 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
310 ///
311 /// \param __A
312 ///    A 128-bit vector of [8 x bfloat].
313 /// \param __B
314 ///    A 128-bit vector of [8 x bfloat].
315 /// \param __D
316 ///    A 128-bit vector of [4 x float].
317 /// \param __U
318 ///    A 8-bit mask value specifying what is chosen for each element.
319 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
320 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
321 ///  __A, __B and __D
322 static __inline__ __m128 __DEFAULT_FN_ATTRS128
323 _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
324   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
325                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
326                                            (__v4sf)_mm_setzero_si128());
327 }
328 
329 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
330 ///
331 /// \headerfile <x86intrin.h>
332 ///
333 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
334 ///
335 /// \param __A
336 ///    A 256-bit vector of [16 x bfloat].
337 /// \param __B
338 ///    A 256-bit vector of [16 x bfloat].
339 /// \param __D
340 ///    A 256-bit vector of [8 x float].
341 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
342 ///  __A, __B and __D
343 static __inline__ __m256 __DEFAULT_FN_ATTRS256
344 _mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
345   return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
346                                              (__v16bf)__A,
347                                              (__v16bf)__B);
348 }
349 
350 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
351 ///
352 /// \headerfile <x86intrin.h>
353 ///
354 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
355 ///
356 /// \param __A
357 ///    A 256-bit vector of [16 x bfloat].
358 /// \param __B
359 ///    A 256-bit vector of [16 x bfloat].
360 /// \param __D
361 ///    A 256-bit vector of [8 x float].
362 /// \param __U
363 ///    A 16-bit mask value specifying what is chosen for each element.
364 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
365 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
366 ///  __A, __B and __D
367 static __inline__ __m256 __DEFAULT_FN_ATTRS256
368 _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
369   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
370                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
371                                         (__v8sf)__D);
372 }
373 
374 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
375 ///
376 /// \headerfile <x86intrin.h>
377 ///
378 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
379 ///
380 /// \param __A
381 ///    A 256-bit vector of [16 x bfloat].
382 /// \param __B
383 ///    A 256-bit vector of [16 x bfloat].
384 /// \param __D
385 ///    A 256-bit vector of [8 x float].
386 /// \param __U
387 ///    A 8-bit mask value specifying what is chosen for each element.
388 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
389 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
390 ///  __A, __B and __D
391 static __inline__ __m256 __DEFAULT_FN_ATTRS256
392 _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
393   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
394                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
395                                         (__v8sf)_mm256_setzero_si256());
396 }
397 
398 /// Convert One Single float Data to One BF16 Data.
399 ///
400 /// \headerfile <x86intrin.h>
401 ///
402 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
403 ///
404 /// \param __A
405 ///    A float data.
406 /// \returns A bf16 data whose sign field and exponent field keep unchanged,
407 ///    and fraction field is truncated to 7 bits.
408 static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
409   __v4sf __V = {__A, 0, 0, 0};
410   __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
411       (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
412   return (__bf16)__R[0];
413 }
414 
415 /// Convert Packed BF16 Data to Packed float Data.
416 ///
417 /// \headerfile <x86intrin.h>
418 ///
419 /// \param __A
420 ///    A 128-bit vector of [4 x bfloat].
421 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
422 static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
423   return _mm_castsi128_ps(
424       (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
425 }
426 
427 /// Convert Packed BF16 Data to Packed float Data.
428 ///
429 /// \headerfile <x86intrin.h>
430 ///
431 /// \param __A
432 ///    A 128-bit vector of [8 x bfloat].
433 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
434 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
435   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
436       (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
437 }
438 
439 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
440 ///
441 /// \headerfile <x86intrin.h>
442 ///
443 /// \param __U
444 ///    A 4-bit mask. Elements are zeroed out when the corresponding mask
445 ///    bit is not set.
446 /// \param __A
447 ///    A 128-bit vector of [4 x bfloat].
448 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
449 static __inline__ __m128 __DEFAULT_FN_ATTRS128
450 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
451   return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
452       (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
453 }
454 
455 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
456 ///
457 /// \headerfile <x86intrin.h>
458 ///
459 /// \param __U
460 ///    A 8-bit mask. Elements are zeroed out when the corresponding mask
461 ///    bit is not set.
462 /// \param __A
463 ///    A 128-bit vector of [8 x bfloat].
464 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
465 static __inline__ __m256 __DEFAULT_FN_ATTRS256
466 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
467   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
468       (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
469 }
470 
471 /// Convert Packed BF16 Data to Packed float Data using merging mask.
472 ///
473 /// \headerfile <x86intrin.h>
474 ///
475 /// \param __S
476 ///    A 128-bit vector of [4 x float]. Elements are copied from __S when
477 ///     the corresponding mask bit is not set.
478 /// \param __U
479 ///    A 4-bit mask. Elements are zeroed out when the corresponding mask
480 ///    bit is not set.
481 /// \param __A
482 ///    A 128-bit vector of [4 x bfloat].
483 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
484 static __inline__ __m128 __DEFAULT_FN_ATTRS128
485 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
486   return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
487       (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
488       16));
489 }
490 
491 /// Convert Packed BF16 Data to Packed float Data using merging mask.
492 ///
493 /// \headerfile <x86intrin.h>
494 ///
495 /// \param __S
496 ///    A 256-bit vector of [8 x float]. Elements are copied from __S when
497 ///     the corresponding mask bit is not set.
498 /// \param __U
499 ///    A 8-bit mask. Elements are zeroed out when the corresponding mask
500 ///    bit is not set.
501 /// \param __A
502 ///    A 128-bit vector of [8 x bfloat].
503 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
504 static __inline__ __m256 __DEFAULT_FN_ATTRS256
505 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
506   return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
507       (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
508       16));
509 }
510 
511 #undef __DEFAULT_FN_ATTRS128
512 #undef __DEFAULT_FN_ATTRS256
513 
514 #endif
515 #endif
516