Lines Matching +full:k +full:- +full:to +full:- +full:j

1 /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
28 /// Convert the scalar BF16 (16-bit) floating-point element
29 /// stored at the memory location \a __A to a
30 /// single-precision (32-bit) floating-point element, broadcast it to packed
31 /// single-precision (32-bit) floating-point elements, and store the results in
40 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
43 /// A pointer to a 16-bit memory location. The address of the memory
44 /// location does not have to be aligned.
46 /// A 128-bit vector of [4 x float].
50 /// FOR j := 0 to 3
51 /// m := j*32
61 /// Convert the scalar BF16 (16-bit) floating-point element
62 /// stored at the memory location \a __A to a
63 /// single-precision (32-bit) floating-point element, broadcast it to packed
64 /// single-precision (32-bit) floating-point elements, and store the results in
73 /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
76 /// A pointer to a 16-bit memory location. The address of the memory
77 /// location does not have to be aligned.
79 /// A 256-bit vector of [8 x float].
83 /// FOR j := 0 to 7
84 /// m := j*32
94 /// Convert the scalar half-precision (16-bit) floating-point element
95 /// stored at the memory location \a __A to a
96 /// single-precision (32-bit) floating-point element, broadcast it to packed
97 /// single-precision (32-bit) floating-point elements, and store the results in
106 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
109 /// A pointer to a 16-bit memory location. The address of the memory
110 /// location does not have to be aligned.
112 /// A 128-bit vector of [4 x float].
116 /// FOR j := 0 to 3
117 /// m := j*32
127 /// Convert the scalar half-precision (16-bit) floating-point element
128 /// stored at the memory location \a __A to a
129 /// single-precision (32-bit) floating-point element, broadcast it to packed
130 /// single-precision (32-bit) floating-point elements, and store the results in
139 /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
142 /// A pointer to a 16-bit memory location. The address of the memory
143 /// location does not have to be aligned.
145 /// A 256-bit vector of [8 x float].
149 /// FOR j := 0 to 7
150 /// m := j*32
160 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
161 /// stored at memory locations starting at location \a __A to packed
162 /// single-precision (32-bit) floating-point elements, and store the results in
171 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
174 /// A pointer to a 128-bit memory location containing 8 consecutive
175 /// BF16 (16-bit) floating-point values.
177 /// A 128-bit vector of [4 x float].
180 /// FOR j := 0 to 3
181 /// k := j*2
182 /// i := k*16
183 /// m := j*32
193 /// Convert packed BF16 (16-bit) floating-point even-indexed elements
194 /// stored at memory locations starting at location \a __A to packed
195 /// single-precision (32-bit) floating-point elements, and store the results in
204 /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
207 /// A pointer to a 256-bit memory location containing 16 consecutive
208 /// BF16 (16-bit) floating-point values.
210 /// A 256-bit vector of [8 x float].
213 /// FOR j := 0 to 7
214 /// k := j*2
215 /// i := k*16
216 /// m := j*32
226 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
227 /// stored at memory locations starting at location \a __A to packed
228 /// single-precision (32-bit) floating-point elements, and store the results in
237 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
240 /// A pointer to a 128-bit memory location containing 8 consecutive
241 /// half-precision (16-bit) floating-point values.
243 /// A 128-bit vector of [4 x float].
246 /// FOR j := 0 to 3
247 /// k := j*2
248 /// i := k*16
249 /// m := j*32
259 /// Convert packed half-precision (16-bit) floating-point even-indexed elements
260 /// stored at memory locations starting at location \a __A to packed
261 /// single-precision (32-bit) floating-point elements, and store the results in
270 /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
273 /// A pointer to a 256-bit memory location containing 16 consecutive
274 /// half-precision (16-bit) floating-point values.
276 /// A 256-bit vector of [8 x float].
279 /// FOR j := 0 to 7
280 /// k := j*2
281 /// i := k*16
282 /// m := j*32
292 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
293 /// stored at memory locations starting at location \a __A to packed
294 /// single-precision (32-bit) floating-point elements, and store the results in
303 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
306 /// A pointer to a 128-bit memory location containing 8 consecutive
307 /// BF16 (16-bit) floating-point values.
309 /// A 128-bit vector of [4 x float].
312 /// FOR j := 0 to 3
313 /// k := j*2+1
314 /// i := k*16
315 /// m := j*32
325 /// Convert packed BF16 (16-bit) floating-point odd-indexed elements
326 /// stored at memory locations starting at location \a __A to packed
327 /// single-precision (32-bit) floating-point elements, and store the results in
336 /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
339 /// A pointer to a 256-bit memory location containing 16 consecutive
340 /// BF16 (16-bit) floating-point values.
342 /// A 256-bit vector of [8 x float].
345 /// FOR j := 0 to 7
346 /// k := j*2+1
347 /// i := k*16
348 /// m := j*32
358 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
359 /// stored at memory locations starting at location \a __A to packed
360 /// single-precision (32-bit) floating-point elements, and store the results in
369 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
372 /// A pointer to a 128-bit memory location containing 8 consecutive
373 /// half-precision (16-bit) floating-point values.
375 /// A 128-bit vector of [4 x float].
378 /// FOR j := 0 to 3
379 /// k := j*2+1
380 /// i := k*16
381 /// m := j*32
391 /// Convert packed half-precision (16-bit) floating-point odd-indexed elements
392 /// stored at memory locations starting at location \a __A to packed
393 /// single-precision (32-bit) floating-point elements, and store the results in
402 /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
405 /// A pointer to a 256-bit memory location containing 16 consecutive
406 /// half-precision (16-bit) floating-point values.
408 /// A 256-bit vector of [8 x float].
411 /// FOR j := 0 to 7
412 /// k := j*2+1
413 /// i := k*16
414 /// m := j*32
424 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
425 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
434 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
437 /// A 128-bit vector of [4 x float].
439 /// A 128-bit vector of [8 x bfloat].
442 /// FOR j := 0 to 3
443 /// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
452 /// Convert packed single-precision (32-bit) floating-point elements in \a __A
453 /// to packed BF16 (16-bit) floating-point elements, and store the results in \a
462 /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
465 /// A 256-bit vector of [8 x float].
467 /// A 128-bit vector of [8 x bfloat].
470 /// FOR j := 0 to 7
471 /// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])