xref: /freebsd/contrib/llvm-project/clang/lib/Headers/amxintrin.h (revision 349cc55c9796c4596a5b9904cd3281af295f878f)
15ffd83dbSDimitry Andric /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
25ffd83dbSDimitry Andric  *
35ffd83dbSDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric  *
75ffd83dbSDimitry Andric  *===------------------------------------------------------------------------===
85ffd83dbSDimitry Andric  */
95ffd83dbSDimitry Andric 
105ffd83dbSDimitry Andric #ifndef __IMMINTRIN_H
115ffd83dbSDimitry Andric #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
125ffd83dbSDimitry Andric #endif /* __IMMINTRIN_H */
135ffd83dbSDimitry Andric 
145ffd83dbSDimitry Andric #ifndef __AMXINTRIN_H
155ffd83dbSDimitry Andric #define __AMXINTRIN_H
165ffd83dbSDimitry Andric #ifdef __x86_64__
175ffd83dbSDimitry Andric 
18fe6060f1SDimitry Andric /* Define the default attributes for the functions in this file. */
19e8d8bef9SDimitry Andric #define __DEFAULT_FN_ATTRS_TILE                                                \
205ffd83dbSDimitry Andric   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
21fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_INT8                                                \
22fe6060f1SDimitry Andric   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
23fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_BF16                                                \
24fe6060f1SDimitry Andric   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
255ffd83dbSDimitry Andric 
265ffd83dbSDimitry Andric /// Load tile configuration from a 64-byte memory location specified by
275ffd83dbSDimitry Andric /// "mem_addr". The tile configuration includes the tile type palette, the
285ffd83dbSDimitry Andric /// number of bytes per row, and the number of rows. If the specified
295ffd83dbSDimitry Andric /// palette_id is zero, that signifies the init state for both the tile
305ffd83dbSDimitry Andric /// config and the tile data, and the tiles are zeroed. Any invalid
315ffd83dbSDimitry Andric /// configurations will result in #GP fault.
325ffd83dbSDimitry Andric ///
33fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
345ffd83dbSDimitry Andric ///
355ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
365ffd83dbSDimitry Andric ///
375ffd83dbSDimitry Andric /// \param __config
385ffd83dbSDimitry Andric ///    A pointer to 512-bits configuration
39e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE
40e8d8bef9SDimitry Andric _tile_loadconfig(const void *__config) {
415ffd83dbSDimitry Andric   __builtin_ia32_tile_loadconfig(__config);
425ffd83dbSDimitry Andric }
435ffd83dbSDimitry Andric 
445ffd83dbSDimitry Andric /// Stores the current tile configuration to a 64-byte memory location
455ffd83dbSDimitry Andric /// specified by "mem_addr". The tile configuration includes the tile type
465ffd83dbSDimitry Andric /// palette, the number of bytes per row, and the number of rows. If tiles
475ffd83dbSDimitry Andric /// are not configured, all zeroes will be stored to memory.
485ffd83dbSDimitry Andric ///
49fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
505ffd83dbSDimitry Andric ///
515ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
525ffd83dbSDimitry Andric ///
535ffd83dbSDimitry Andric /// \param __config
545ffd83dbSDimitry Andric ///    A pointer to 512-bits configuration
55e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE
56e8d8bef9SDimitry Andric _tile_storeconfig(void *__config) {
575ffd83dbSDimitry Andric   __builtin_ia32_tile_storeconfig(__config);
585ffd83dbSDimitry Andric }
595ffd83dbSDimitry Andric 
605ffd83dbSDimitry Andric /// Release the tile configuration to return to the init state, which
615ffd83dbSDimitry Andric /// releases all storage it currently holds.
625ffd83dbSDimitry Andric ///
63fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
645ffd83dbSDimitry Andric ///
655ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
66e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
675ffd83dbSDimitry Andric   __builtin_ia32_tilerelease();
685ffd83dbSDimitry Andric }
695ffd83dbSDimitry Andric 
705ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
715ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured
725ffd83dbSDimitry Andric /// via "_tile_loadconfig".
735ffd83dbSDimitry Andric ///
74fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
755ffd83dbSDimitry Andric ///
765ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
775ffd83dbSDimitry Andric ///
785ffd83dbSDimitry Andric /// \param dst
795ffd83dbSDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
805ffd83dbSDimitry Andric /// \param base
815ffd83dbSDimitry Andric ///    A pointer to base address.
825ffd83dbSDimitry Andric /// \param stride
835ffd83dbSDimitry Andric ///    The stride between the rows' data to be loaded in memory.
845ffd83dbSDimitry Andric #define _tile_loadd(dst, base, stride)                                         \
85e8d8bef9SDimitry Andric   __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
86e8d8bef9SDimitry Andric                              (__SIZE_TYPE__)(stride))
875ffd83dbSDimitry Andric 
885ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
895ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured
905ffd83dbSDimitry Andric /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
915ffd83dbSDimitry Andric /// that the data will likely not be reused in the near future and the data
925ffd83dbSDimitry Andric /// caching can be optimized accordingly.
935ffd83dbSDimitry Andric ///
94fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
955ffd83dbSDimitry Andric ///
965ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
975ffd83dbSDimitry Andric ///
985ffd83dbSDimitry Andric /// \param dst
995ffd83dbSDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
1005ffd83dbSDimitry Andric /// \param base
1015ffd83dbSDimitry Andric ///    A pointer to base address.
1025ffd83dbSDimitry Andric /// \param stride
1035ffd83dbSDimitry Andric ///    The stride between the rows' data to be loaded in memory.
1045ffd83dbSDimitry Andric #define _tile_stream_loadd(dst, base, stride)                                  \
105e8d8bef9SDimitry Andric   __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
106e8d8bef9SDimitry Andric                                (__SIZE_TYPE__)(stride))
1075ffd83dbSDimitry Andric 
1085ffd83dbSDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and
1095ffd83dbSDimitry Andric /// "stride" using the tile configuration previously configured via
1105ffd83dbSDimitry Andric /// "_tile_loadconfig".
1115ffd83dbSDimitry Andric ///
112fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1135ffd83dbSDimitry Andric ///
1145ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
1155ffd83dbSDimitry Andric ///
1165ffd83dbSDimitry Andric /// \param dst
1175ffd83dbSDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
1185ffd83dbSDimitry Andric /// \param base
1195ffd83dbSDimitry Andric ///    A pointer to base address.
1205ffd83dbSDimitry Andric /// \param stride
1215ffd83dbSDimitry Andric ///    The stride between the rows' data to be stored in memory.
1225ffd83dbSDimitry Andric #define _tile_stored(dst, base, stride)                                        \
1235ffd83dbSDimitry Andric   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
1245ffd83dbSDimitry Andric 
1255ffd83dbSDimitry Andric /// Zero the tile specified by "tdest".
1265ffd83dbSDimitry Andric ///
127fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1285ffd83dbSDimitry Andric ///
1295ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
1305ffd83dbSDimitry Andric ///
1315ffd83dbSDimitry Andric /// \param tile
1325ffd83dbSDimitry Andric ///    The destination tile to be zero. Max size is 1024 Bytes.
1335ffd83dbSDimitry Andric #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
1345ffd83dbSDimitry Andric 
1355ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1365ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
1375ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
1385ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
1395ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst".
1405ffd83dbSDimitry Andric ///
141fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1425ffd83dbSDimitry Andric ///
1435ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
1445ffd83dbSDimitry Andric ///
1455ffd83dbSDimitry Andric /// \param dst
1465ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
1475ffd83dbSDimitry Andric /// \param src0
1485ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
1495ffd83dbSDimitry Andric /// \param src1
1505ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
151e8d8bef9SDimitry Andric #define _tile_dpbssd(dst, src0, src1)                                          \
152e8d8bef9SDimitry Andric   __builtin_ia32_tdpbssd((dst), (src0), (src1))
1535ffd83dbSDimitry Andric 
1545ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1555ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
1565ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
1575ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
1585ffd83dbSDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst".
1595ffd83dbSDimitry Andric ///
160fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1615ffd83dbSDimitry Andric ///
1625ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
1635ffd83dbSDimitry Andric ///
1645ffd83dbSDimitry Andric /// \param dst
1655ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
1665ffd83dbSDimitry Andric /// \param src0
1675ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
1685ffd83dbSDimitry Andric /// \param src1
1695ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
170e8d8bef9SDimitry Andric #define _tile_dpbsud(dst, src0, src1)                                          \
171e8d8bef9SDimitry Andric   __builtin_ia32_tdpbsud((dst), (src0), (src1))
1725ffd83dbSDimitry Andric 
1735ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1745ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
1755ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
1765ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
1775ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst".
1785ffd83dbSDimitry Andric ///
179fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1805ffd83dbSDimitry Andric ///
1815ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
1825ffd83dbSDimitry Andric ///
1835ffd83dbSDimitry Andric /// \param dst
1845ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
1855ffd83dbSDimitry Andric /// \param src0
1865ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
1875ffd83dbSDimitry Andric /// \param src1
1885ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
189e8d8bef9SDimitry Andric #define _tile_dpbusd(dst, src0, src1)                                          \
190e8d8bef9SDimitry Andric   __builtin_ia32_tdpbusd((dst), (src0), (src1))
1915ffd83dbSDimitry Andric 
1925ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1935ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
1945ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
1955ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
1965ffd83dbSDimitry Andric /// "dst", and store the 32-bit result back to tile "dst".
1975ffd83dbSDimitry Andric ///
198fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1995ffd83dbSDimitry Andric ///
2005ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
2015ffd83dbSDimitry Andric ///
2025ffd83dbSDimitry Andric /// \param dst
2035ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
2045ffd83dbSDimitry Andric /// \param src0
2055ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
2065ffd83dbSDimitry Andric /// \param src1
2075ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
208e8d8bef9SDimitry Andric #define _tile_dpbuud(dst, src0, src1)                                          \
209e8d8bef9SDimitry Andric   __builtin_ia32_tdpbuud((dst), (src0), (src1))
2105ffd83dbSDimitry Andric 
2115ffd83dbSDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
2125ffd83dbSDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point
2135ffd83dbSDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile
2145ffd83dbSDimitry Andric /// "dst".
2155ffd83dbSDimitry Andric ///
216fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
2175ffd83dbSDimitry Andric ///
2185ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
2195ffd83dbSDimitry Andric ///
2205ffd83dbSDimitry Andric /// \param dst
2215ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
2225ffd83dbSDimitry Andric /// \param src0
2235ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
2245ffd83dbSDimitry Andric /// \param src1
2255ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
2265ffd83dbSDimitry Andric #define _tile_dpbf16ps(dst, src0, src1)                                        \
2275ffd83dbSDimitry Andric   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
2285ffd83dbSDimitry Andric 
229fe6060f1SDimitry Andric /// AMX tile register size can be configured, the maximum size is 16x64=1024
230fe6060f1SDimitry Andric /// bytes. Since there is no 2D type in llvm IR, we use vector type to
231fe6060f1SDimitry Andric /// represent 2D tile and the fixed size is maximum amx tile register size.
232e8d8bef9SDimitry Andric typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
233fe6060f1SDimitry Andric 
234fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
235e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
236e8d8bef9SDimitry Andric _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
237e8d8bef9SDimitry Andric                      __SIZE_TYPE__ stride) {
238e8d8bef9SDimitry Andric   return __builtin_ia32_tileloadd64_internal(m, n, base,
239e8d8bef9SDimitry Andric                                              (__SIZE_TYPE__)(stride));
240e8d8bef9SDimitry Andric }
241e8d8bef9SDimitry Andric 
242fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
243fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
244fe6060f1SDimitry Andric _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
245fe6060f1SDimitry Andric                        __SIZE_TYPE__ stride) {
246fe6060f1SDimitry Andric   return __builtin_ia32_tileloaddt164_internal(m, n, base,
247fe6060f1SDimitry Andric                                                (__SIZE_TYPE__)(stride));
248fe6060f1SDimitry Andric }
249fe6060f1SDimitry Andric 
250fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
251e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
252e8d8bef9SDimitry Andric _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
253e8d8bef9SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
254e8d8bef9SDimitry Andric   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
255e8d8bef9SDimitry Andric }
256e8d8bef9SDimitry Andric 
257fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
258fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
259fe6060f1SDimitry Andric _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
260fe6060f1SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
261fe6060f1SDimitry Andric   return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
262fe6060f1SDimitry Andric }
263fe6060f1SDimitry Andric 
264fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
265fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
266fe6060f1SDimitry Andric _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
267fe6060f1SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
268fe6060f1SDimitry Andric   return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
269fe6060f1SDimitry Andric }
270fe6060f1SDimitry Andric 
271fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
272fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
273fe6060f1SDimitry Andric _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
274fe6060f1SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
275fe6060f1SDimitry Andric   return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
276fe6060f1SDimitry Andric }
277fe6060f1SDimitry Andric 
278fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
279e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_INT8
280e8d8bef9SDimitry Andric _tile_stored_internal(unsigned short m, unsigned short n, void *base,
281e8d8bef9SDimitry Andric                       __SIZE_TYPE__ stride, _tile1024i tile) {
282e8d8bef9SDimitry Andric   return __builtin_ia32_tilestored64_internal(m, n, base,
283e8d8bef9SDimitry Andric                                               (__SIZE_TYPE__)(stride), tile);
284e8d8bef9SDimitry Andric }
285e8d8bef9SDimitry Andric 
286fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
287fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
288fe6060f1SDimitry Andric _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
289fe6060f1SDimitry Andric                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
290fe6060f1SDimitry Andric   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
291fe6060f1SDimitry Andric }
292fe6060f1SDimitry Andric 
293fe6060f1SDimitry Andric /// This struct pack the shape and tile data together for user. We suggest
294fe6060f1SDimitry Andric /// initializing the struct as early as possible, because compiler depends
295fe6060f1SDimitry Andric /// on the shape information to do configure. The constant value is preferred
296fe6060f1SDimitry Andric /// for optimization by compiler.
297e8d8bef9SDimitry Andric typedef struct __tile1024i_str {
298e8d8bef9SDimitry Andric   const unsigned short row;
299e8d8bef9SDimitry Andric   const unsigned short col;
300e8d8bef9SDimitry Andric   _tile1024i tile;
301e8d8bef9SDimitry Andric } __tile1024i;
302e8d8bef9SDimitry Andric 
303fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
304fe6060f1SDimitry Andric /// destination tile "dst".
305fe6060f1SDimitry Andric ///
306fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
307fe6060f1SDimitry Andric ///
308fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
309fe6060f1SDimitry Andric ///
310fe6060f1SDimitry Andric /// \param dst
311fe6060f1SDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
312fe6060f1SDimitry Andric /// \param base
313fe6060f1SDimitry Andric ///    A pointer to base address.
314fe6060f1SDimitry Andric /// \param stride
315fe6060f1SDimitry Andric ///    The stride between the rows' data to be loaded in memory.
316e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
317*349cc55cSDimitry Andric static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
318e8d8bef9SDimitry Andric                                     __SIZE_TYPE__ stride) {
319e8d8bef9SDimitry Andric   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
320e8d8bef9SDimitry Andric }
321e8d8bef9SDimitry Andric 
322fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
323fe6060f1SDimitry Andric /// destination tile "dst". This intrinsic provides a hint to the implementation
324fe6060f1SDimitry Andric /// that the data will likely not be reused in the near future and the data
325fe6060f1SDimitry Andric /// caching can be optimized accordingly.
326fe6060f1SDimitry Andric ///
327fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
328fe6060f1SDimitry Andric ///
329fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
330fe6060f1SDimitry Andric ///
331fe6060f1SDimitry Andric /// \param dst
332fe6060f1SDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
333fe6060f1SDimitry Andric /// \param base
334fe6060f1SDimitry Andric ///    A pointer to base address.
335fe6060f1SDimitry Andric /// \param stride
336fe6060f1SDimitry Andric ///    The stride between the rows' data to be loaded in memory.
337fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_TILE
338*349cc55cSDimitry Andric static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
339fe6060f1SDimitry Andric                                            __SIZE_TYPE__ stride) {
340fe6060f1SDimitry Andric   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
341e8d8bef9SDimitry Andric }
342e8d8bef9SDimitry Andric 
343fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
344fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
345fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
346fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
347fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst".
348fe6060f1SDimitry Andric ///
349fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
350fe6060f1SDimitry Andric ///
351fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
352fe6060f1SDimitry Andric ///
353fe6060f1SDimitry Andric /// \param dst
354fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
355fe6060f1SDimitry Andric /// \param src0
356fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
357fe6060f1SDimitry Andric /// \param src1
358fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
359fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
360*349cc55cSDimitry Andric static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
361fe6060f1SDimitry Andric                                      __tile1024i src1) {
362fe6060f1SDimitry Andric   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
363fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
364fe6060f1SDimitry Andric }
365fe6060f1SDimitry Andric 
366fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
367fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
368fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
369fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
370fe6060f1SDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst".
371fe6060f1SDimitry Andric ///
372fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
373fe6060f1SDimitry Andric ///
374fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
375fe6060f1SDimitry Andric ///
376fe6060f1SDimitry Andric /// \param dst
377fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
378fe6060f1SDimitry Andric /// \param src0
379fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
380fe6060f1SDimitry Andric /// \param src1
381fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
382fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
383*349cc55cSDimitry Andric static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
384fe6060f1SDimitry Andric                                      __tile1024i src1) {
385fe6060f1SDimitry Andric   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
386fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
387fe6060f1SDimitry Andric }
388fe6060f1SDimitry Andric 
389fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
390fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
391fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
392fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
393fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst".
394fe6060f1SDimitry Andric ///
395fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
396fe6060f1SDimitry Andric ///
397fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
398fe6060f1SDimitry Andric ///
399fe6060f1SDimitry Andric /// \param dst
400fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
401fe6060f1SDimitry Andric /// \param src0
402fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
403fe6060f1SDimitry Andric /// \param src1
404fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
405fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
406*349cc55cSDimitry Andric static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
407fe6060f1SDimitry Andric                                      __tile1024i src1) {
408fe6060f1SDimitry Andric   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
409fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
410fe6060f1SDimitry Andric }
411fe6060f1SDimitry Andric 
412fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
413fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
414fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
415fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
416fe6060f1SDimitry Andric /// "dst", and store the 32-bit result back to tile "dst".
417fe6060f1SDimitry Andric ///
418fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
419fe6060f1SDimitry Andric ///
420fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
421fe6060f1SDimitry Andric ///
422fe6060f1SDimitry Andric /// \param dst
423fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
424fe6060f1SDimitry Andric /// \param src0
425fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
426fe6060f1SDimitry Andric /// \param src1
427fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
428fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
429*349cc55cSDimitry Andric static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
430fe6060f1SDimitry Andric                                      __tile1024i src1) {
431fe6060f1SDimitry Andric   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
432fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
433fe6060f1SDimitry Andric }
434fe6060f1SDimitry Andric 
435fe6060f1SDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and
436fe6060f1SDimitry Andric /// "stride".
437fe6060f1SDimitry Andric ///
438fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
439fe6060f1SDimitry Andric ///
440fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
441fe6060f1SDimitry Andric ///
442fe6060f1SDimitry Andric /// \param dst
443fe6060f1SDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
444fe6060f1SDimitry Andric /// \param base
445fe6060f1SDimitry Andric ///    A pointer to base address.
446fe6060f1SDimitry Andric /// \param stride
447fe6060f1SDimitry Andric ///    The stride between the rows' data to be stored in memory.
448e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
449*349cc55cSDimitry Andric static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
450*349cc55cSDimitry Andric                                      __tile1024i src) {
451e8d8bef9SDimitry Andric   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
452e8d8bef9SDimitry Andric }
453e8d8bef9SDimitry Andric 
454fe6060f1SDimitry Andric /// Zero the tile specified by "dst".
455fe6060f1SDimitry Andric ///
456fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
457fe6060f1SDimitry Andric ///
458fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
459fe6060f1SDimitry Andric ///
460fe6060f1SDimitry Andric /// \param dst
461fe6060f1SDimitry Andric ///    The destination tile to be zero. Max size is 1024 Bytes.
462e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
463*349cc55cSDimitry Andric static __inline__ void __tile_zero(__tile1024i *dst) {
464e8d8bef9SDimitry Andric   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
465e8d8bef9SDimitry Andric }
4665ffd83dbSDimitry Andric 
467fe6060f1SDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
468fe6060f1SDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point
469fe6060f1SDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile
470fe6060f1SDimitry Andric /// "dst".
471fe6060f1SDimitry Andric ///
472fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
473fe6060f1SDimitry Andric ///
474fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
475fe6060f1SDimitry Andric ///
476fe6060f1SDimitry Andric /// \param dst
477fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
478fe6060f1SDimitry Andric /// \param src0
479fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
480fe6060f1SDimitry Andric /// \param src1
481fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
482fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_BF16
483*349cc55cSDimitry Andric static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
484fe6060f1SDimitry Andric                                        __tile1024i src1) {
485fe6060f1SDimitry Andric   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
486fe6060f1SDimitry Andric                                       src0.tile, src1.tile);
487fe6060f1SDimitry Andric }
488fe6060f1SDimitry Andric 
489fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_TILE
490fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_INT8
491fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_BF16
492fe6060f1SDimitry Andric 
4935ffd83dbSDimitry Andric #endif /* __x86_64__ */
4945ffd83dbSDimitry Andric #endif /* __AMXINTRIN_H */
495