avxneconvertintrin.h - OpenGrok cross reference for /freebsd/contrib/llvm-project/clang/lib/Headers/avxneconvertintrin.h

Lines Matching +full:k +full:- +full:to +full:- +full:j
1 /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7  *===-----------------------------------------------------------------------===
28 /// Convert scalar BF16 (16-bit) floating-point element
29 /// stored at memory locations starting at location \a __A to a
30 /// single-precision (32-bit) floating-point element, broadcast it to packed
31 /// single-precision (32-bit) floating-point elements, and store the results in
40 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
43 ///    A pointer to a 16-bit memory location. The address of the memory
44 ///    location does not have to be aligned.
46 ///    A 128-bit vector of [4 x float].
50 /// FOR j := 0 to 3
51 ///   m := j*32
61 /// Convert scalar BF16 (16-bit) floating-point element
62 /// stored at memory locations starting at location \a __A to a
63 /// single-precision (32-bit) floating-point element, broadcast it to packed
64 /// single-precision (32-bit) floating-point elements, and store the results in
73 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
76 ///    A pointer to a 16-bit memory location. The address of the memory
77 ///    location does not have to be aligned.
79 ///    A 256-bit vector of [8 x float].
83 /// FOR j := 0 to 7
84 ///   m := j*32
94 /// Convert scalar half-precision (16-bit) floating-point element
95 /// stored at memory locations starting at location \a __A to a
96 /// single-precision (32-bit) floating-point element, broadcast it to packed
97 /// single-precision (32-bit) floating-point elements, and store the results in
106 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
109 ///    A pointer to a 16-bit memory location. The address of the memory
110 ///    location does not have to be aligned.
112 ///    A 128-bit vector of [4 x float].
116 /// FOR j := 0 to 3
117 ///   m := j*32
127 /// Convert scalar half-precision (16-bit) floating-point element
128 /// stored at memory locations starting at location \a __A to a
129 /// single-precision (32-bit) floating-point element, broadcast it to packed
130 /// single-precision (32-bit) floating-point elements, and store the results in
139 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
142 ///    A pointer to a 16-bit memory location. The address of the memory
143 ///    location does not have to be aligned.
145 ///    A 256-bit vector of [8 x float].
149 /// FOR j := 0 to 7
150 ///   m := j*32
160 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
161 /// stored at memory locations starting at location \a __A to packed
162 /// single-precision (32-bit) floating-point elements, and store the results in
171 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
174 ///    A pointer to a 128-bit memory location containing 8 consecutive
175 ///    BF16 (16-bit) floating-point values.
177 ///    A 128-bit vector of [4 x float].
180 /// FOR j := 0 to 3
181 /// 	k := j*2
182 /// 	i := k*16
183 /// 	m := j*32
193 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
194 /// stored at memory locations starting at location \a __A to packed
195 /// single-precision (32-bit) floating-point elements, and store the results in
204 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
207 ///    A pointer to a 256-bit memory location containing 16 consecutive
208 ///    BF16 (16-bit) floating-point values.
210 ///    A 256-bit vector of [8 x float].
213 /// FOR j := 0 to 7
214 /// 	k := j*2
215 /// 	i := k*16
216 /// 	m := j*32
226 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
227 /// stored at memory locations starting at location \a __A to packed
228 /// single-precision (32-bit) floating-point elements, and store the results in
237 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
240 ///    A pointer to a 128-bit memory location containing 8 consecutive
241 ///    half-precision (16-bit) floating-point values.
243 ///    A 128-bit vector of [4 x float].
246 /// FOR j := 0 to 3
247 /// 	k := j*2
248 /// 	i := k*16
249 /// 	m := j*32
259 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
260 /// stored at memory locations starting at location \a __A to packed
261 /// single-precision (32-bit) floating-point elements, and store the results in
270 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
273 ///    A pointer to a 256-bit memory location containing 16 consecutive
274 ///    half-precision (16-bit) floating-point values.
276 ///    A 256-bit vector of [8 x float].
279 /// FOR j := 0 to 7
280 /// 	k := j*2
281 /// 	i := k*16
282 /// 	m := j*32
292 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
293 /// stored at memory locations starting at location \a __A to packed
294 /// single-precision (32-bit) floating-point elements, and store the results in
303 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
306 ///    A pointer to a 128-bit memory location containing 8 consecutive
307 ///    BF16 (16-bit) floating-point values.
309 ///    A 128-bit vector of [4 x float].
312 /// FOR j := 0 to 3
313 /// 	k := j*2+1
314 /// 	i := k*16
315 /// 	m := j*32
325 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
326 /// stored at memory locations starting at location \a __A to packed
327 /// single-precision (32-bit) floating-point elements, and store the results in
336 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
339 ///    A pointer to a 256-bit memory location containing 16 consecutive
340 ///    BF16 (16-bit) floating-point values.
342 ///    A 256-bit vector of [8 x float].
345 /// FOR j := 0 to 7
346 /// 	k := j*2+1
347 /// 	i := k*16
348 /// 	m := j*32
358 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
359 /// stored at memory locations starting at location \a __A to packed
360 /// single-precision (32-bit) floating-point elements, and store the results in
369 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
372 ///    A pointer to a 128-bit memory location containing 8 consecutive
373 ///    half-precision (16-bit) floating-point values.
375 ///    A 128-bit vector of [4 x float].
378 /// FOR j := 0 to 3
379 /// 	k := j*2+1
380 /// 	i := k*16
381 /// 	m := j*32
391 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
392 /// stored at memory locations starting at location \a __A to packed
393 /// single-precision (32-bit) floating-point elements, and store the results in
402 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
405 ///    A pointer to a 256-bit memory location containing 16 consecutive
406 ///    half-precision (16-bit) floating-point values.
408 ///    A 256-bit vector of [8 x float].
411 /// FOR j := 0 to 7
412 /// 	k := j*2+1
413 /// 	i := k*16
414 /// 	m := j*32
424 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
425 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
434 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
437 ///    A 128-bit vector of [4 x float].
439 ///    A 128-bit vector of [8 x bfloat].
442 /// FOR j := 0 to 3
443 /// 	dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
452 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
453 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
462 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
465 ///    A 256-bit vector of [8 x float].
467 ///    A 128-bit vector of [8 x bfloat].
470 /// FOR j := 0 to 7
471 /// 	dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])