Lines Matching +full:row +full:- +full:stride
1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===------------------------------------------------------------------------===
20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
26 __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
28 /// Load tile configuration from a 64-byte memory location specified by
30 /// number of bytes per row, and the number of rows. If the specified
40 /// A pointer to the 512-bit configuration
46 /// Stores the current tile configuration to a 64-byte memory location
48 /// palette, the number of bytes per row, and the number of rows. If tiles
56 /// A pointer to the 512-bit configuration
72 /// Load tile rows from memory specified by "base" address and "stride" into
84 /// \param stride
85 /// The stride between the rows' data to be loaded in memory.
86 #define _tile_loadd(dst, base, stride) \ argument
88 (__SIZE_TYPE__)(stride))
90 /// Load tile rows from memory specified by "base" address and "stride" into
104 /// \param stride
105 /// The stride between the rows' data to be loaded in memory.
106 #define _tile_stream_loadd(dst, base, stride) \ argument
108 (__SIZE_TYPE__)(stride))
111 /// "stride" using the tile configuration previously configured via
122 /// \param stride
123 /// The stride between the rows' data to be stored in memory.
124 #define _tile_stored(dst, base, stride) \ argument
125 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
137 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
138 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
139 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
140 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
141 /// and store the 32-bit result back to tile "dst".
156 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
157 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
158 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
159 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
160 /// in "dst", and store the 32-bit result back to tile "dst".
175 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
176 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
177 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
178 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
179 /// and store the 32-bit result back to tile "dst".
194 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
195 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
196 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
197 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
198 /// "dst", and store the 32-bit result back to tile "dst".
213 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
214 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
215 /// elements with elements in "dst", and store the 32-bit result back to tile
239 __SIZE_TYPE__ stride) { in _tile_loadd_internal() argument
241 (__SIZE_TYPE__)(stride)); in _tile_loadd_internal()
247 __SIZE_TYPE__ stride) { in _tile_loaddt1_internal() argument
249 (__SIZE_TYPE__)(stride)); in _tile_loaddt1_internal()
283 __SIZE_TYPE__ stride, _tile1024i tile) { in _tile_stored_internal() argument
285 (__SIZE_TYPE__)(stride), tile); in _tile_stored_internal()
307 const unsigned short row; member
312 /// Load tile rows from memory specified by "base" address and "stride" into
323 /// \param stride
324 /// The stride between the rows' data to be loaded in memory.
327 __SIZE_TYPE__ stride) { in __tile_loadd() argument
328 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); in __tile_loadd()
331 /// Load tile rows from memory specified by "base" address and "stride" into
344 /// \param stride
345 /// The stride between the rows' data to be loaded in memory.
348 __SIZE_TYPE__ stride) { in __tile_stream_loadd() argument
349 dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); in __tile_stream_loadd()
352 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
353 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
354 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
355 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
356 /// and store the 32-bit result back to tile "dst".
371 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, in __tile_dpbssd()
375 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
376 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
377 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
378 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
379 /// in "dst", and store the 32-bit result back to tile "dst".
394 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, in __tile_dpbsud()
398 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
399 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
400 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
401 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
402 /// and store the 32-bit result back to tile "dst".
417 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, in __tile_dpbusd()
421 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
422 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
423 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
424 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
425 /// "dst", and store the 32-bit result back to tile "dst".
440 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, in __tile_dpbuud()
445 /// "stride".
453 /// \param stride
454 /// The stride between the rows' data to be stored in memory.
456 static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride, in __tile_stored() argument
458 _tile_stored_internal(src.row, src.col, base, stride, src.tile); in __tile_stored()
471 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); in __tile_zero()
474 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
475 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
476 /// elements with elements in "dst", and store the 32-bit result back to tile
492 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, in __tile_dpbf16ps()
496 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
497 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
498 /// elements with elements in "dst", and store the 32-bit result back to tile
514 dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, in __tile_dpfp16ps()