/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10 #ifndef __IMMINTRIN_H
11 #error \
12 "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
14
15 #ifdef __SSE2__
16
17 #ifndef __AVXNECONVERTINTRIN_H
18 #define __AVXNECONVERTINTRIN_H
19
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info and enable the
 * "avxneconvert" target feature for the annotated function; they differ only
 * in the minimum vector width they request (128 vs. 256 bits). They are
 * #undef'd again at the end of this header. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
                 __min_vector_width__(256)))

28 /// Convert scalar BF16 (16-bit) floating-point element
29 /// stored at memory locations starting at location \a __A to a
30 /// single-precision (32-bit) floating-point, broadcast it to packed
31 /// single-precision (32-bit) floating-point elements, and store the results in
32 /// \a dst.
33 ///
34 /// \headerfile <x86intrin.h>
35 ///
36 /// \code
37 /// _mm_bcstnebf16_ps(const void *__A);
38 /// \endcode
39 ///
40 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
41 ///
42 /// \param __A
43 /// A pointer to a 16-bit memory location. The address of the memory
44 /// location does not have to be aligned.
45 /// \returns
46 /// A 128-bit vector of [4 x float].
47 ///
48 /// \code{.operation}
49 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
50 /// FOR j := 0 to 3
51 /// m := j*32
52 /// dst[m+31:m] := b
53 /// ENDFOR
54 /// dst[MAX:128] := 0
55 /// \endcode
56 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void * __A)57 _mm_bcstnebf16_ps(const void *__A) {
58 return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
59 }
60
61 /// Convert scalar BF16 (16-bit) floating-point element
62 /// stored at memory locations starting at location \a __A to a
63 /// single-precision (32-bit) floating-point, broadcast it to packed
64 /// single-precision (32-bit) floating-point elements, and store the results in
65 /// \a dst.
66 ///
67 /// \headerfile <x86intrin.h>
68 ///
69 /// \code
70 /// _mm256_bcstnebf16_ps(const void *__A);
71 /// \endcode
72 ///
73 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
74 ///
75 /// \param __A
76 /// A pointer to a 16-bit memory location. The address of the memory
77 /// location does not have to be aligned.
78 /// \returns
79 /// A 256-bit vector of [8 x float].
80 ///
81 /// \code{.operation}
82 /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
83 /// FOR j := 0 to 7
84 /// m := j*32
85 /// dst[m+31:m] := b
86 /// ENDFOR
87 /// dst[MAX:256] := 0
88 /// \endcode
89 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void * __A)90 _mm256_bcstnebf16_ps(const void *__A) {
91 return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
92 }
93
94 /// Convert scalar half-precision (16-bit) floating-point element
95 /// stored at memory locations starting at location \a __A to a
96 /// single-precision (32-bit) floating-point, broadcast it to packed
97 /// single-precision (32-bit) floating-point elements, and store the results in
98 /// \a dst.
99 ///
100 /// \headerfile <x86intrin.h>
101 ///
102 /// \code
103 /// _mm_bcstnesh_ps(const void *__A);
104 /// \endcode
105 ///
106 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
107 ///
108 /// \param __A
109 /// A pointer to a 16-bit memory location. The address of the memory
110 /// location does not have to be aligned.
111 /// \returns
112 /// A 128-bit vector of [4 x float].
113 ///
114 /// \code{.operation}
115 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
116 /// FOR j := 0 to 3
117 /// m := j*32
118 /// dst[m+31:m] := b
119 /// ENDFOR
120 /// dst[MAX:128] := 0
121 /// \endcode
122 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void * __A)123 _mm_bcstnesh_ps(const void *__A) {
124 return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
125 }
126
127 /// Convert scalar half-precision (16-bit) floating-point element
128 /// stored at memory locations starting at location \a __A to a
129 /// single-precision (32-bit) floating-point, broadcast it to packed
130 /// single-precision (32-bit) floating-point elements, and store the results in
131 /// \a dst.
132 ///
133 /// \headerfile <x86intrin.h>
134 ///
135 /// \code
136 /// _mm256_bcstnesh_ps(const void *__A);
137 /// \endcode
138 ///
139 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
140 ///
141 /// \param __A
142 /// A pointer to a 16-bit memory location. The address of the memory
143 /// location does not have to be aligned.
144 /// \returns
145 /// A 256-bit vector of [8 x float].
146 ///
147 /// \code{.operation}
148 /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
149 /// FOR j := 0 to 7
150 /// m := j*32
151 /// dst[m+31:m] := b
152 /// ENDFOR
153 /// dst[MAX:256] := 0
154 /// \endcode
155 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void * __A)156 _mm256_bcstnesh_ps(const void *__A) {
157 return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
158 }
159
160 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
161 /// stored at memory locations starting at location \a __A to packed
162 /// single-precision (32-bit) floating-point elements, and store the results in
163 /// \a dst.
164 ///
165 /// \headerfile <x86intrin.h>
166 ///
167 /// \code
168 /// _mm_cvtneebf16_ps(const __m128bh *__A);
169 /// \endcode
170 ///
171 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
172 ///
173 /// \param __A
174 /// A pointer to a 128-bit memory location containing 8 consecutive
175 /// BF16 (16-bit) floating-point values.
176 /// \returns
177 /// A 128-bit vector of [4 x float].
178 ///
179 /// \code{.operation}
180 /// FOR j := 0 to 3
181 /// k := j*2
182 /// i := k*16
183 /// m := j*32
184 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
185 /// ENDFOR
186 /// dst[MAX:128] := 0
187 /// \endcode
188 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh * __A)189 _mm_cvtneebf16_ps(const __m128bh *__A) {
190 return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
191 }
192
193 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
194 /// stored at memory locations starting at location \a __A to packed
195 /// single-precision (32-bit) floating-point elements, and store the results in
196 /// \a dst.
197 ///
198 /// \headerfile <x86intrin.h>
199 ///
200 /// \code
201 /// _mm256_cvtneebf16_ps(const __m256bh *__A);
202 /// \endcode
203 ///
204 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
205 ///
206 /// \param __A
207 /// A pointer to a 256-bit memory location containing 16 consecutive
208 /// BF16 (16-bit) floating-point values.
209 /// \returns
210 /// A 256-bit vector of [8 x float].
211 ///
212 /// \code{.operation}
213 /// FOR j := 0 to 7
214 /// k := j*2
215 /// i := k*16
216 /// m := j*32
217 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
218 /// ENDFOR
219 /// dst[MAX:256] := 0
220 /// \endcode
221 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh * __A)222 _mm256_cvtneebf16_ps(const __m256bh *__A) {
223 return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
224 }
225
226 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
227 /// stored at memory locations starting at location \a __A to packed
228 /// single-precision (32-bit) floating-point elements, and store the results in
229 /// \a dst.
230 ///
231 /// \headerfile <x86intrin.h>
232 ///
233 /// \code
234 /// _mm_cvtneeph_ps(const __m128h *__A);
235 /// \endcode
236 ///
237 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
238 ///
239 /// \param __A
240 /// A pointer to a 128-bit memory location containing 8 consecutive
241 /// half-precision (16-bit) floating-point values.
242 /// \returns
243 /// A 128-bit vector of [4 x float].
244 ///
245 /// \code{.operation}
246 /// FOR j := 0 to 3
247 /// k := j*2
248 /// i := k*16
249 /// m := j*32
250 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
251 /// ENDFOR
252 /// dst[MAX:128] := 0
253 /// \endcode
254 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h * __A)255 _mm_cvtneeph_ps(const __m128h *__A) {
256 return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
257 }
258
259 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
260 /// stored at memory locations starting at location \a __A to packed
261 /// single-precision (32-bit) floating-point elements, and store the results in
262 /// \a dst.
263 ///
264 /// \headerfile <x86intrin.h>
265 ///
266 /// \code
267 /// _mm256_cvtneeph_ps(const __m256h *__A);
268 /// \endcode
269 ///
270 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
271 ///
272 /// \param __A
273 /// A pointer to a 256-bit memory location containing 16 consecutive
274 /// half-precision (16-bit) floating-point values.
275 /// \returns
276 /// A 256-bit vector of [8 x float].
277 ///
278 /// \code{.operation}
279 /// FOR j := 0 to 7
280 /// k := j*2
281 /// i := k*16
282 /// m := j*32
283 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
284 /// ENDFOR
285 /// dst[MAX:256] := 0
286 /// \endcode
287 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h * __A)288 _mm256_cvtneeph_ps(const __m256h *__A) {
289 return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
290 }
291
292 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
293 /// stored at memory locations starting at location \a __A to packed
294 /// single-precision (32-bit) floating-point elements, and store the results in
295 /// \a dst.
296 ///
297 /// \headerfile <x86intrin.h>
298 ///
299 /// \code
300 /// _mm_cvtneobf16_ps(const __m128bh *__A);
301 /// \endcode
302 ///
303 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
304 ///
305 /// \param __A
306 /// A pointer to a 128-bit memory location containing 8 consecutive
307 /// BF16 (16-bit) floating-point values.
308 /// \returns
309 /// A 128-bit vector of [4 x float].
310 ///
311 /// \code{.operation}
312 /// FOR j := 0 to 3
313 /// k := j*2+1
314 /// i := k*16
315 /// m := j*32
316 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
317 /// ENDFOR
318 /// dst[MAX:128] := 0
319 /// \endcode
320 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh * __A)321 _mm_cvtneobf16_ps(const __m128bh *__A) {
322 return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
323 }
324
325 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
326 /// stored at memory locations starting at location \a __A to packed
327 /// single-precision (32-bit) floating-point elements, and store the results in
328 /// \a dst.
329 ///
330 /// \headerfile <x86intrin.h>
331 ///
332 /// \code
333 /// _mm256_cvtneobf16_ps(const __m256bh *__A);
334 /// \endcode
335 ///
336 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
337 ///
338 /// \param __A
339 /// A pointer to a 256-bit memory location containing 16 consecutive
340 /// BF16 (16-bit) floating-point values.
341 /// \returns
342 /// A 256-bit vector of [8 x float].
343 ///
344 /// \code{.operation}
345 /// FOR j := 0 to 7
346 /// k := j*2+1
347 /// i := k*16
348 /// m := j*32
349 /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
350 /// ENDFOR
351 /// dst[MAX:256] := 0
352 /// \endcode
353 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh * __A)354 _mm256_cvtneobf16_ps(const __m256bh *__A) {
355 return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
356 }
357
358 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
359 /// stored at memory locations starting at location \a __A to packed
360 /// single-precision (32-bit) floating-point elements, and store the results in
361 /// \a dst.
362 ///
363 /// \headerfile <x86intrin.h>
364 ///
365 /// \code
366 /// _mm_cvtneoph_ps(const __m128h *__A);
367 /// \endcode
368 ///
369 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
370 ///
371 /// \param __A
372 /// A pointer to a 128-bit memory location containing 8 consecutive
373 /// half-precision (16-bit) floating-point values.
374 /// \returns
375 /// A 128-bit vector of [4 x float].
376 ///
377 /// \code{.operation}
378 /// FOR j := 0 to 3
379 /// k := j*2+1
380 /// i := k*16
381 /// m := j*32
382 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
383 /// ENDFOR
384 /// dst[MAX:128] := 0
385 /// \endcode
386 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h * __A)387 _mm_cvtneoph_ps(const __m128h *__A) {
388 return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
389 }
390
391 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
392 /// stored at memory locations starting at location \a __A to packed
393 /// single-precision (32-bit) floating-point elements, and store the results in
394 /// \a dst.
395 ///
396 /// \headerfile <x86intrin.h>
397 ///
398 /// \code
399 /// _mm256_cvtneoph_ps(const __m256h *__A);
400 /// \endcode
401 ///
402 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
403 ///
404 /// \param __A
405 /// A pointer to a 256-bit memory location containing 16 consecutive
406 /// half-precision (16-bit) floating-point values.
407 /// \returns
408 /// A 256-bit vector of [8 x float].
409 ///
410 /// \code{.operation}
411 /// FOR j := 0 to 7
412 /// k := j*2+1
413 /// i := k*16
414 /// m := j*32
415 /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
416 /// ENDFOR
417 /// dst[MAX:256] := 0
418 /// \endcode
419 static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h * __A)420 _mm256_cvtneoph_ps(const __m256h *__A) {
421 return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
422 }
423
424 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
425 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
426 /// dst.
427 ///
428 /// \headerfile <x86intrin.h>
429 ///
430 /// \code
431 /// _mm_cvtneps_avx_pbh(__m128 __A);
432 /// \endcode
433 ///
434 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
435 ///
436 /// \param __A
437 /// A 128-bit vector of [4 x float].
438 /// \returns
439 /// A 128-bit vector of [8 x bfloat].
440 ///
441 /// \code{.operation}
442 /// FOR j := 0 to 3
443 /// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
444 /// ENDFOR
445 /// dst[MAX:128] := 0
446 /// \endcode
447 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A)448 _mm_cvtneps_avx_pbh(__m128 __A) {
449 return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
450 }
451
452 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
453 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
454 /// dst.
455 ///
456 /// \headerfile <x86intrin.h>
457 ///
458 /// \code
459 /// _mm256_cvtneps_avx_pbh(__m256 __A);
460 /// \endcode
461 ///
462 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
463 ///
464 /// \param __A
465 /// A 256-bit vector of [8 x float].
466 /// \returns
467 /// A 128-bit vector of [8 x bfloat].
468 ///
469 /// \code{.operation}
470 /// FOR j := 0 to 7
471 /// dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
472 /// ENDFOR
473 /// dst[MAX:128] := 0
474 /// \endcode
475 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A)476 _mm256_cvtneps_avx_pbh(__m256 __A) {
477 return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
478 }
479
480 #undef __DEFAULT_FN_ATTRS128
481 #undef __DEFAULT_FN_ATTRS256
482
483 #endif // __AVXNECONVERTINTRIN_H
484 #endif // __SSE2__
485