amxintrin.h - OpenGrok cross reference for /freebsd/contrib/llvm-project/clang/lib/Headers/amxintrin.h

Lines Matching +full:row +full:- +full:stride
1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7  *===------------------------------------------------------------------------===
20   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
22   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
24   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
26   __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
28 /// Load tile configuration from a 64-byte memory location specified by
30 /// number of bytes per row, and the number of rows. If the specified
40 ///    A pointer to 512-bit configuration
46 /// Stores the current tile configuration to a 64-byte memory location
48 /// palette, the number of bytes per row, and the number of rows. If tiles
56 ///    A pointer to 512-bit configuration
72 /// Load tile rows from memory specified by "base" address and "stride" into
84 /// \param stride
85 ///    The stride between the rows' data to be loaded in memory.
86 #define _tile_loadd(dst, base, stride)                                         \  argument
88                              (__SIZE_TYPE__)(stride))
90 /// Load tile rows from memory specified by "base" address and "stride" into
104 /// \param stride
105 ///    The stride between the rows' data to be loaded in memory.
106 #define _tile_stream_loadd(dst, base, stride)                                  \  argument
108                                (__SIZE_TYPE__)(stride))
111 /// "stride" using the tile configuration previously configured via
122 /// \param stride
123 ///    The stride between the rows' data to be stored in memory.
124 #define _tile_stored(dst, base, stride)                                        \  argument
125   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
137 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
138 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
139 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
140 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
141 /// and store the 32-bit result back to tile "dst".
156 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
157 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
158 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
159 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
160 /// in "dst", and store the 32-bit result back to tile "dst".
175 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
176 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
177 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
178 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
179 /// and store the 32-bit result back to tile "dst".
194 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
195 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
196 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
197 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
198 /// "dst", and store the 32-bit result back to tile "dst".
213 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
214 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
215 /// elements with elements in "dst", and store the 32-bit result back to tile
239                      __SIZE_TYPE__ stride) {  in _tile_loadd_internal()  argument
241                                              (__SIZE_TYPE__)(stride));  in _tile_loadd_internal()
247                        __SIZE_TYPE__ stride) {  in _tile_loaddt1_internal()  argument
249                                                (__SIZE_TYPE__)(stride));  in _tile_loaddt1_internal()
283                       __SIZE_TYPE__ stride, _tile1024i tile) {  in _tile_stored_internal()  argument
285                                               (__SIZE_TYPE__)(stride), tile);  in _tile_stored_internal()
307   const unsigned short row;  member
312 /// Load tile rows from memory specified by "base" address and "stride" into
323 /// \param stride
324 ///    The stride between the rows' data to be loaded in memory.
327                                     __SIZE_TYPE__ stride) {  in __tile_loadd()  argument
328   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);  in __tile_loadd()
331 /// Load tile rows from memory specified by "base" address and "stride" into
344 /// \param stride
345 ///    The stride between the rows' data to be loaded in memory.
348                                            __SIZE_TYPE__ stride) {  in __tile_stream_loadd()  argument
349   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);  in __tile_stream_loadd()
352 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
353 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
354 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
355 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
356 /// and store the 32-bit result back to tile "dst".
371   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,  in __tile_dpbssd()
375 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
376 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
377 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
378 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
379 /// in "dst", and store the 32-bit result back to tile "dst".
394   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,  in __tile_dpbsud()
398 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
399 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
400 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
401 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
402 /// and store the 32-bit result back to tile "dst".
417   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,  in __tile_dpbusd()
421 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
422 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
423 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
424 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
425 /// "dst", and store the 32-bit result back to tile "dst".
440   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,  in __tile_dpbuud()
445 /// "stride".
453 /// \param stride
454 ///    The stride between the rows' data to be stored in memory.
456 static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,  in __tile_stored()  argument
458   _tile_stored_internal(src.row, src.col, base, stride, src.tile);  in __tile_stored()
471   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);  in __tile_zero()
474 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
475 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
476 /// elements with elements in "dst", and store the 32-bit result back to tile
492   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,  in __tile_dpbf16ps()
496 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
497 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
498 /// elements with elements in "dst", and store the 32-bit result back to tile
514   dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,  in __tile_dpfp16ps()