xref: /freebsd/contrib/llvm-project/clang/lib/Headers/amxintrin.h (revision fe6060f10f634930ff71b7c50291ddc610da2475)
15ffd83dbSDimitry Andric /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
25ffd83dbSDimitry Andric  *
35ffd83dbSDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric  *
75ffd83dbSDimitry Andric  *===------------------------------------------------------------------------===
85ffd83dbSDimitry Andric  */
95ffd83dbSDimitry Andric 
105ffd83dbSDimitry Andric #ifndef __IMMINTRIN_H
115ffd83dbSDimitry Andric #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
125ffd83dbSDimitry Andric #endif /* __IMMINTRIN_H */
135ffd83dbSDimitry Andric 
145ffd83dbSDimitry Andric #ifndef __AMXINTRIN_H
155ffd83dbSDimitry Andric #define __AMXINTRIN_H
165ffd83dbSDimitry Andric #ifdef __x86_64__
175ffd83dbSDimitry Andric 
18*fe6060f1SDimitry Andric /* Define the default attributes for the functions in this file. */
19e8d8bef9SDimitry Andric #define __DEFAULT_FN_ATTRS_TILE                                                \
205ffd83dbSDimitry Andric   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
21*fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_INT8                                                \
22*fe6060f1SDimitry Andric   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
23*fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_BF16                                                \
24*fe6060f1SDimitry Andric   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
255ffd83dbSDimitry Andric 
265ffd83dbSDimitry Andric /// Load tile configuration from a 64-byte memory location specified by
275ffd83dbSDimitry Andric /// "mem_addr". The tile configuration includes the tile type palette, the
285ffd83dbSDimitry Andric /// number of bytes per row, and the number of rows. If the specified
295ffd83dbSDimitry Andric /// palette_id is zero, that signifies the init state for both the tile
305ffd83dbSDimitry Andric /// config and the tile data, and the tiles are zeroed. Any invalid
315ffd83dbSDimitry Andric /// configurations will result in #GP fault.
325ffd83dbSDimitry Andric ///
33*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
345ffd83dbSDimitry Andric ///
355ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
365ffd83dbSDimitry Andric ///
375ffd83dbSDimitry Andric /// \param __config
385ffd83dbSDimitry Andric ///    A pointer to 512-bits configuration
39e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE
40e8d8bef9SDimitry Andric _tile_loadconfig(const void *__config) {
415ffd83dbSDimitry Andric   __builtin_ia32_tile_loadconfig(__config);
425ffd83dbSDimitry Andric }
435ffd83dbSDimitry Andric 
445ffd83dbSDimitry Andric /// Stores the current tile configuration to a 64-byte memory location
455ffd83dbSDimitry Andric /// specified by "mem_addr". The tile configuration includes the tile type
465ffd83dbSDimitry Andric /// palette, the number of bytes per row, and the number of rows. If tiles
475ffd83dbSDimitry Andric /// are not configured, all zeroes will be stored to memory.
485ffd83dbSDimitry Andric ///
49*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
505ffd83dbSDimitry Andric ///
515ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
525ffd83dbSDimitry Andric ///
535ffd83dbSDimitry Andric /// \param __config
545ffd83dbSDimitry Andric ///    A pointer to 512-bits configuration
55e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE
56e8d8bef9SDimitry Andric _tile_storeconfig(void *__config) {
575ffd83dbSDimitry Andric   __builtin_ia32_tile_storeconfig(__config);
585ffd83dbSDimitry Andric }
595ffd83dbSDimitry Andric 
605ffd83dbSDimitry Andric /// Release the tile configuration to return to the init state, which
615ffd83dbSDimitry Andric /// releases all storage it currently holds.
625ffd83dbSDimitry Andric ///
63*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
645ffd83dbSDimitry Andric ///
655ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
66e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
675ffd83dbSDimitry Andric   __builtin_ia32_tilerelease();
685ffd83dbSDimitry Andric }
695ffd83dbSDimitry Andric 
705ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
715ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured
725ffd83dbSDimitry Andric /// via "_tile_loadconfig".
735ffd83dbSDimitry Andric ///
74*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
755ffd83dbSDimitry Andric ///
765ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
775ffd83dbSDimitry Andric ///
785ffd83dbSDimitry Andric /// \param dst
795ffd83dbSDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
805ffd83dbSDimitry Andric /// \param base
815ffd83dbSDimitry Andric ///    A pointer to base address.
825ffd83dbSDimitry Andric /// \param stride
835ffd83dbSDimitry Andric ///    The stride between the rows' data to be loaded in memory.
845ffd83dbSDimitry Andric #define _tile_loadd(dst, base, stride)                                         \
85e8d8bef9SDimitry Andric   __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
86e8d8bef9SDimitry Andric                              (__SIZE_TYPE__)(stride))
875ffd83dbSDimitry Andric 
885ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
895ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured
905ffd83dbSDimitry Andric /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
915ffd83dbSDimitry Andric /// that the data will likely not be reused in the near future and the data
925ffd83dbSDimitry Andric /// caching can be optimized accordingly.
935ffd83dbSDimitry Andric ///
94*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
955ffd83dbSDimitry Andric ///
965ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
975ffd83dbSDimitry Andric ///
985ffd83dbSDimitry Andric /// \param dst
995ffd83dbSDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
1005ffd83dbSDimitry Andric /// \param base
1015ffd83dbSDimitry Andric ///    A pointer to base address.
1025ffd83dbSDimitry Andric /// \param stride
1035ffd83dbSDimitry Andric ///    The stride between the rows' data to be loaded in memory.
1045ffd83dbSDimitry Andric #define _tile_stream_loadd(dst, base, stride)                                  \
105e8d8bef9SDimitry Andric   __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
106e8d8bef9SDimitry Andric                                (__SIZE_TYPE__)(stride))
1075ffd83dbSDimitry Andric 
1085ffd83dbSDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and
1095ffd83dbSDimitry Andric /// "stride" using the tile configuration previously configured via
1105ffd83dbSDimitry Andric /// "_tile_loadconfig".
1115ffd83dbSDimitry Andric ///
112*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1135ffd83dbSDimitry Andric ///
1145ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
1155ffd83dbSDimitry Andric ///
1165ffd83dbSDimitry Andric /// \param dst
1175ffd83dbSDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
1185ffd83dbSDimitry Andric /// \param base
1195ffd83dbSDimitry Andric ///    A pointer to base address.
1205ffd83dbSDimitry Andric /// \param stride
1215ffd83dbSDimitry Andric ///    The stride between the rows' data to be stored in memory.
1225ffd83dbSDimitry Andric #define _tile_stored(dst, base, stride)                                        \
1235ffd83dbSDimitry Andric   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
1245ffd83dbSDimitry Andric 
1255ffd83dbSDimitry Andric /// Zero the tile specified by "tdest".
1265ffd83dbSDimitry Andric ///
127*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1285ffd83dbSDimitry Andric ///
1295ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
1305ffd83dbSDimitry Andric ///
1315ffd83dbSDimitry Andric /// \param tile
1325ffd83dbSDimitry Andric ///    The destination tile to be zero. Max size is 1024 Bytes.
1335ffd83dbSDimitry Andric #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
1345ffd83dbSDimitry Andric 
1355ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1365ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
1375ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
1385ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
1395ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst".
1405ffd83dbSDimitry Andric ///
141*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1425ffd83dbSDimitry Andric ///
1435ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
1445ffd83dbSDimitry Andric ///
1455ffd83dbSDimitry Andric /// \param dst
1465ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
1475ffd83dbSDimitry Andric /// \param src0
1485ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
1495ffd83dbSDimitry Andric /// \param src1
1505ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
151e8d8bef9SDimitry Andric #define _tile_dpbssd(dst, src0, src1)                                          \
152e8d8bef9SDimitry Andric   __builtin_ia32_tdpbssd((dst), (src0), (src1))
1535ffd83dbSDimitry Andric 
1545ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1555ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
1565ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
1575ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
1585ffd83dbSDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst".
1595ffd83dbSDimitry Andric ///
160*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1615ffd83dbSDimitry Andric ///
1625ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
1635ffd83dbSDimitry Andric ///
1645ffd83dbSDimitry Andric /// \param dst
1655ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
1665ffd83dbSDimitry Andric /// \param src0
1675ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
1685ffd83dbSDimitry Andric /// \param src1
1695ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
170e8d8bef9SDimitry Andric #define _tile_dpbsud(dst, src0, src1)                                          \
171e8d8bef9SDimitry Andric   __builtin_ia32_tdpbsud((dst), (src0), (src1))
1725ffd83dbSDimitry Andric 
1735ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1745ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
1755ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
1765ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
1775ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst".
1785ffd83dbSDimitry Andric ///
179*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1805ffd83dbSDimitry Andric ///
1815ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
1825ffd83dbSDimitry Andric ///
1835ffd83dbSDimitry Andric /// \param dst
1845ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
1855ffd83dbSDimitry Andric /// \param src0
1865ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
1875ffd83dbSDimitry Andric /// \param src1
1885ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
189e8d8bef9SDimitry Andric #define _tile_dpbusd(dst, src0, src1)                                          \
190e8d8bef9SDimitry Andric   __builtin_ia32_tdpbusd((dst), (src0), (src1))
1915ffd83dbSDimitry Andric 
1925ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
1935ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
1945ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
1955ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
1965ffd83dbSDimitry Andric /// "dst", and store the 32-bit result back to tile "dst".
1975ffd83dbSDimitry Andric ///
198*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
1995ffd83dbSDimitry Andric ///
2005ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
2015ffd83dbSDimitry Andric ///
2025ffd83dbSDimitry Andric /// \param dst
2035ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
2045ffd83dbSDimitry Andric /// \param src0
2055ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
2065ffd83dbSDimitry Andric /// \param src1
2075ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
208e8d8bef9SDimitry Andric #define _tile_dpbuud(dst, src0, src1)                                          \
209e8d8bef9SDimitry Andric   __builtin_ia32_tdpbuud((dst), (src0), (src1))
2105ffd83dbSDimitry Andric 
2115ffd83dbSDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
2125ffd83dbSDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point
2135ffd83dbSDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile
2145ffd83dbSDimitry Andric /// "dst".
2155ffd83dbSDimitry Andric ///
216*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
2175ffd83dbSDimitry Andric ///
2185ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
2195ffd83dbSDimitry Andric ///
2205ffd83dbSDimitry Andric /// \param dst
2215ffd83dbSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
2225ffd83dbSDimitry Andric /// \param src0
2235ffd83dbSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
2245ffd83dbSDimitry Andric /// \param src1
2255ffd83dbSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
2265ffd83dbSDimitry Andric #define _tile_dpbf16ps(dst, src0, src1)                                        \
2275ffd83dbSDimitry Andric   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
2285ffd83dbSDimitry Andric 
229*fe6060f1SDimitry Andric /// AMX tile register size can be configured, the maximum size is 16x64=1024
230*fe6060f1SDimitry Andric /// bytes. Since there is no 2D type in llvm IR, we use vector type to
231*fe6060f1SDimitry Andric /// represent 2D tile and the fixed size is maximum amx tile register size.
232e8d8bef9SDimitry Andric typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
233*fe6060f1SDimitry Andric 
234*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
235e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
236e8d8bef9SDimitry Andric _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
237e8d8bef9SDimitry Andric                      __SIZE_TYPE__ stride) {
238e8d8bef9SDimitry Andric   return __builtin_ia32_tileloadd64_internal(m, n, base,
239e8d8bef9SDimitry Andric                                              (__SIZE_TYPE__)(stride));
240e8d8bef9SDimitry Andric }
241e8d8bef9SDimitry Andric 
242*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
243*fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
244*fe6060f1SDimitry Andric _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
245*fe6060f1SDimitry Andric                        __SIZE_TYPE__ stride) {
246*fe6060f1SDimitry Andric   return __builtin_ia32_tileloaddt164_internal(m, n, base,
247*fe6060f1SDimitry Andric                                                (__SIZE_TYPE__)(stride));
248*fe6060f1SDimitry Andric }
249*fe6060f1SDimitry Andric 
250*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
251e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
252e8d8bef9SDimitry Andric _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
253e8d8bef9SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
254e8d8bef9SDimitry Andric   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
255e8d8bef9SDimitry Andric }
256e8d8bef9SDimitry Andric 
257*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
258*fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
259*fe6060f1SDimitry Andric _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
260*fe6060f1SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
261*fe6060f1SDimitry Andric   return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
262*fe6060f1SDimitry Andric }
263*fe6060f1SDimitry Andric 
264*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
265*fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
266*fe6060f1SDimitry Andric _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
267*fe6060f1SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
268*fe6060f1SDimitry Andric   return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
269*fe6060f1SDimitry Andric }
270*fe6060f1SDimitry Andric 
271*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
272*fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
273*fe6060f1SDimitry Andric _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
274*fe6060f1SDimitry Andric                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
275*fe6060f1SDimitry Andric   return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
276*fe6060f1SDimitry Andric }
277*fe6060f1SDimitry Andric 
278*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
279e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_INT8
280e8d8bef9SDimitry Andric _tile_stored_internal(unsigned short m, unsigned short n, void *base,
281e8d8bef9SDimitry Andric                       __SIZE_TYPE__ stride, _tile1024i tile) {
282e8d8bef9SDimitry Andric   return __builtin_ia32_tilestored64_internal(m, n, base,
283e8d8bef9SDimitry Andric                                               (__SIZE_TYPE__)(stride), tile);
284e8d8bef9SDimitry Andric }
285e8d8bef9SDimitry Andric 
286*fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly.
287*fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
288*fe6060f1SDimitry Andric _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
289*fe6060f1SDimitry Andric                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
290*fe6060f1SDimitry Andric   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
291*fe6060f1SDimitry Andric }
292*fe6060f1SDimitry Andric 
293*fe6060f1SDimitry Andric /// This struct pack the shape and tile data together for user. We suggest
294*fe6060f1SDimitry Andric /// initializing the struct as early as possible, because compiler depends
295*fe6060f1SDimitry Andric /// on the shape information to do configure. The constant value is preferred
296*fe6060f1SDimitry Andric /// for optimization by compiler.
297e8d8bef9SDimitry Andric typedef struct __tile1024i_str {
298e8d8bef9SDimitry Andric   const unsigned short row;
299e8d8bef9SDimitry Andric   const unsigned short col;
300e8d8bef9SDimitry Andric   _tile1024i tile;
301e8d8bef9SDimitry Andric } __tile1024i;
302e8d8bef9SDimitry Andric 
303*fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
304*fe6060f1SDimitry Andric /// destination tile "dst".
305*fe6060f1SDimitry Andric ///
306*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
307*fe6060f1SDimitry Andric ///
308*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
309*fe6060f1SDimitry Andric ///
310*fe6060f1SDimitry Andric /// \param dst
311*fe6060f1SDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
312*fe6060f1SDimitry Andric /// \param base
313*fe6060f1SDimitry Andric ///    A pointer to base address.
314*fe6060f1SDimitry Andric /// \param stride
315*fe6060f1SDimitry Andric ///    The stride between the rows' data to be loaded in memory.
316e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
317e8d8bef9SDimitry Andric static void __tile_loadd(__tile1024i *dst, const void *base,
318e8d8bef9SDimitry Andric                          __SIZE_TYPE__ stride) {
319e8d8bef9SDimitry Andric   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
320e8d8bef9SDimitry Andric }
321e8d8bef9SDimitry Andric 
322*fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into
323*fe6060f1SDimitry Andric /// destination tile "dst". This intrinsic provides a hint to the implementation
324*fe6060f1SDimitry Andric /// that the data will likely not be reused in the near future and the data
325*fe6060f1SDimitry Andric /// caching can be optimized accordingly.
326*fe6060f1SDimitry Andric ///
327*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
328*fe6060f1SDimitry Andric ///
329*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
330*fe6060f1SDimitry Andric ///
331*fe6060f1SDimitry Andric /// \param dst
332*fe6060f1SDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
333*fe6060f1SDimitry Andric /// \param base
334*fe6060f1SDimitry Andric ///    A pointer to base address.
335*fe6060f1SDimitry Andric /// \param stride
336*fe6060f1SDimitry Andric ///    The stride between the rows' data to be loaded in memory.
337*fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_TILE
338*fe6060f1SDimitry Andric static void __tile_stream_loadd(__tile1024i *dst, const void *base,
339*fe6060f1SDimitry Andric                                 __SIZE_TYPE__ stride) {
340*fe6060f1SDimitry Andric   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
341e8d8bef9SDimitry Andric }
342e8d8bef9SDimitry Andric 
343*fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
344*fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
345*fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
346*fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
347*fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst".
348*fe6060f1SDimitry Andric ///
349*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
350*fe6060f1SDimitry Andric ///
351*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
352*fe6060f1SDimitry Andric ///
353*fe6060f1SDimitry Andric /// \param dst
354*fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
355*fe6060f1SDimitry Andric /// \param src0
356*fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
357*fe6060f1SDimitry Andric /// \param src1
358*fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
359*fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
360*fe6060f1SDimitry Andric static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
361*fe6060f1SDimitry Andric                           __tile1024i src1) {
362*fe6060f1SDimitry Andric   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
363*fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
364*fe6060f1SDimitry Andric }
365*fe6060f1SDimitry Andric 
366*fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
367*fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
368*fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
369*fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
370*fe6060f1SDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst".
371*fe6060f1SDimitry Andric ///
372*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
373*fe6060f1SDimitry Andric ///
374*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
375*fe6060f1SDimitry Andric ///
376*fe6060f1SDimitry Andric /// \param dst
377*fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
378*fe6060f1SDimitry Andric /// \param src0
379*fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
380*fe6060f1SDimitry Andric /// \param src1
381*fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
382*fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
383*fe6060f1SDimitry Andric static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
384*fe6060f1SDimitry Andric                           __tile1024i src1) {
385*fe6060f1SDimitry Andric   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
386*fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
387*fe6060f1SDimitry Andric }
388*fe6060f1SDimitry Andric 
389*fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
390*fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
391*fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
392*fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
393*fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst".
394*fe6060f1SDimitry Andric ///
395*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
396*fe6060f1SDimitry Andric ///
397*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
398*fe6060f1SDimitry Andric ///
399*fe6060f1SDimitry Andric /// \param dst
400*fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
401*fe6060f1SDimitry Andric /// \param src0
402*fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
403*fe6060f1SDimitry Andric /// \param src1
404*fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
405*fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
406*fe6060f1SDimitry Andric static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
407*fe6060f1SDimitry Andric                           __tile1024i src1) {
408*fe6060f1SDimitry Andric   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
409*fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
410*fe6060f1SDimitry Andric }
411*fe6060f1SDimitry Andric 
412*fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator.
413*fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
414*fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
415*fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
416*fe6060f1SDimitry Andric /// "dst", and store the 32-bit result back to tile "dst".
417*fe6060f1SDimitry Andric ///
418*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
419*fe6060f1SDimitry Andric ///
420*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
421*fe6060f1SDimitry Andric ///
422*fe6060f1SDimitry Andric /// \param dst
423*fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
424*fe6060f1SDimitry Andric /// \param src0
425*fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
426*fe6060f1SDimitry Andric /// \param src1
427*fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
428*fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8
429*fe6060f1SDimitry Andric static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
430*fe6060f1SDimitry Andric                           __tile1024i src1) {
431*fe6060f1SDimitry Andric   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
432*fe6060f1SDimitry Andric                                     src0.tile, src1.tile);
433*fe6060f1SDimitry Andric }
434*fe6060f1SDimitry Andric 
435*fe6060f1SDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and
436*fe6060f1SDimitry Andric /// "stride".
437*fe6060f1SDimitry Andric ///
438*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
439*fe6060f1SDimitry Andric ///
440*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
441*fe6060f1SDimitry Andric ///
442*fe6060f1SDimitry Andric /// \param dst
443*fe6060f1SDimitry Andric ///    A destination tile. Max size is 1024 Bytes.
444*fe6060f1SDimitry Andric /// \param base
445*fe6060f1SDimitry Andric ///    A pointer to base address.
446*fe6060f1SDimitry Andric /// \param stride
447*fe6060f1SDimitry Andric ///    The stride between the rows' data to be stored in memory.
448e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
449e8d8bef9SDimitry Andric static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
450e8d8bef9SDimitry Andric   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
451e8d8bef9SDimitry Andric }
452e8d8bef9SDimitry Andric 
453*fe6060f1SDimitry Andric /// Zero the tile specified by "dst".
454*fe6060f1SDimitry Andric ///
455*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
456*fe6060f1SDimitry Andric ///
457*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
458*fe6060f1SDimitry Andric ///
459*fe6060f1SDimitry Andric /// \param dst
460*fe6060f1SDimitry Andric ///    The destination tile to be zero. Max size is 1024 Bytes.
461e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE
462e8d8bef9SDimitry Andric static void __tile_zero(__tile1024i *dst) {
463e8d8bef9SDimitry Andric   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
464e8d8bef9SDimitry Andric }
4655ffd83dbSDimitry Andric 
466*fe6060f1SDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
467*fe6060f1SDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point
468*fe6060f1SDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile
469*fe6060f1SDimitry Andric /// "dst".
470*fe6060f1SDimitry Andric ///
471*fe6060f1SDimitry Andric /// \headerfile <immintrin.h>
472*fe6060f1SDimitry Andric ///
473*fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
474*fe6060f1SDimitry Andric ///
475*fe6060f1SDimitry Andric /// \param dst
476*fe6060f1SDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
477*fe6060f1SDimitry Andric /// \param src0
478*fe6060f1SDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
479*fe6060f1SDimitry Andric /// \param src1
480*fe6060f1SDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
481*fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_BF16
482*fe6060f1SDimitry Andric static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
483*fe6060f1SDimitry Andric                             __tile1024i src1) {
484*fe6060f1SDimitry Andric   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
485*fe6060f1SDimitry Andric                                       src0.tile, src1.tile);
486*fe6060f1SDimitry Andric }
487*fe6060f1SDimitry Andric 
488*fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_TILE
489*fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_INT8
490*fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_BF16
491*fe6060f1SDimitry Andric 
4925ffd83dbSDimitry Andric #endif /* __x86_64__ */
4935ffd83dbSDimitry Andric #endif /* __AMXINTRIN_H */
494