15ffd83dbSDimitry Andric /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== 25ffd83dbSDimitry Andric * 35ffd83dbSDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 45ffd83dbSDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 55ffd83dbSDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 65ffd83dbSDimitry Andric * 75ffd83dbSDimitry Andric *===------------------------------------------------------------------------=== 85ffd83dbSDimitry Andric */ 95ffd83dbSDimitry Andric 105ffd83dbSDimitry Andric #ifndef __IMMINTRIN_H 115ffd83dbSDimitry Andric #error "Never use <amxintrin.h> directly; include <immintrin.h> instead." 125ffd83dbSDimitry Andric #endif /* __IMMINTRIN_H */ 135ffd83dbSDimitry Andric 145ffd83dbSDimitry Andric #ifndef __AMXINTRIN_H 155ffd83dbSDimitry Andric #define __AMXINTRIN_H 165ffd83dbSDimitry Andric #ifdef __x86_64__ 175ffd83dbSDimitry Andric 18fe6060f1SDimitry Andric /* Define the default attributes for the functions in this file. */ 19e8d8bef9SDimitry Andric #define __DEFAULT_FN_ATTRS_TILE \ 205ffd83dbSDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) 21fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_INT8 \ 22fe6060f1SDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) 23fe6060f1SDimitry Andric #define __DEFAULT_FN_ATTRS_BF16 \ 24fe6060f1SDimitry Andric __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16"))) 255ffd83dbSDimitry Andric 265ffd83dbSDimitry Andric /// Load tile configuration from a 64-byte memory location specified by 275ffd83dbSDimitry Andric /// "mem_addr". The tile configuration includes the tile type palette, the 285ffd83dbSDimitry Andric /// number of bytes per row, and the number of rows. If the specified 295ffd83dbSDimitry Andric /// palette_id is zero, that signifies the init state for both the tile 305ffd83dbSDimitry Andric /// config and the tile data, and the tiles are zeroed. Any invalid 315ffd83dbSDimitry Andric /// configurations will result in #GP fault. 325ffd83dbSDimitry Andric /// 33fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 345ffd83dbSDimitry Andric /// 355ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction. 365ffd83dbSDimitry Andric /// 375ffd83dbSDimitry Andric /// \param __config 385ffd83dbSDimitry Andric /// A pointer to 512-bits configuration 39e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE 40e8d8bef9SDimitry Andric _tile_loadconfig(const void *__config) { 415ffd83dbSDimitry Andric __builtin_ia32_tile_loadconfig(__config); 425ffd83dbSDimitry Andric } 435ffd83dbSDimitry Andric 445ffd83dbSDimitry Andric /// Stores the current tile configuration to a 64-byte memory location 455ffd83dbSDimitry Andric /// specified by "mem_addr". The tile configuration includes the tile type 465ffd83dbSDimitry Andric /// palette, the number of bytes per row, and the number of rows. If tiles 475ffd83dbSDimitry Andric /// are not configured, all zeroes will be stored to memory. 485ffd83dbSDimitry Andric /// 49fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 505ffd83dbSDimitry Andric /// 515ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> STTILECFG </c> instruction. 525ffd83dbSDimitry Andric /// 535ffd83dbSDimitry Andric /// \param __config 545ffd83dbSDimitry Andric /// A pointer to 512-bits configuration 55e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE 56e8d8bef9SDimitry Andric _tile_storeconfig(void *__config) { 575ffd83dbSDimitry Andric __builtin_ia32_tile_storeconfig(__config); 585ffd83dbSDimitry Andric } 595ffd83dbSDimitry Andric 605ffd83dbSDimitry Andric /// Release the tile configuration to return to the init state, which 615ffd83dbSDimitry Andric /// releases all storage it currently holds. 625ffd83dbSDimitry Andric /// 63fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 645ffd83dbSDimitry Andric /// 655ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction. 66e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { 675ffd83dbSDimitry Andric __builtin_ia32_tilerelease(); 685ffd83dbSDimitry Andric } 695ffd83dbSDimitry Andric 705ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into 715ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured 725ffd83dbSDimitry Andric /// via "_tile_loadconfig". 735ffd83dbSDimitry Andric /// 74fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 755ffd83dbSDimitry Andric /// 765ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 775ffd83dbSDimitry Andric /// 785ffd83dbSDimitry Andric /// \param dst 795ffd83dbSDimitry Andric /// A destination tile. Max size is 1024 Bytes. 805ffd83dbSDimitry Andric /// \param base 815ffd83dbSDimitry Andric /// A pointer to base address. 825ffd83dbSDimitry Andric /// \param stride 835ffd83dbSDimitry Andric /// The stride between the rows' data to be loaded in memory. 845ffd83dbSDimitry Andric #define _tile_loadd(dst, base, stride) \ 85e8d8bef9SDimitry Andric __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ 86e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride)) 875ffd83dbSDimitry Andric 885ffd83dbSDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into 895ffd83dbSDimitry Andric /// destination tile "dst" using the tile configuration previously configured 905ffd83dbSDimitry Andric /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation 915ffd83dbSDimitry Andric /// that the data will likely not be reused in the near future and the data 925ffd83dbSDimitry Andric /// caching can be optimized accordingly. 935ffd83dbSDimitry Andric /// 94fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 955ffd83dbSDimitry Andric /// 965ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 975ffd83dbSDimitry Andric /// 985ffd83dbSDimitry Andric /// \param dst 995ffd83dbSDimitry Andric /// A destination tile. Max size is 1024 Bytes. 1005ffd83dbSDimitry Andric /// \param base 1015ffd83dbSDimitry Andric /// A pointer to base address. 1025ffd83dbSDimitry Andric /// \param stride 1035ffd83dbSDimitry Andric /// The stride between the rows' data to be loaded in memory. 1045ffd83dbSDimitry Andric #define _tile_stream_loadd(dst, base, stride) \ 105e8d8bef9SDimitry Andric __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ 106e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride)) 1075ffd83dbSDimitry Andric 1085ffd83dbSDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and 1095ffd83dbSDimitry Andric /// "stride" using the tile configuration previously configured via 1105ffd83dbSDimitry Andric /// "_tile_loadconfig". 1115ffd83dbSDimitry Andric /// 112fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 1135ffd83dbSDimitry Andric /// 1145ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 1155ffd83dbSDimitry Andric /// 1165ffd83dbSDimitry Andric /// \param dst 1175ffd83dbSDimitry Andric /// A destination tile. Max size is 1024 Bytes. 1185ffd83dbSDimitry Andric /// \param base 1195ffd83dbSDimitry Andric /// A pointer to base address. 1205ffd83dbSDimitry Andric /// \param stride 1215ffd83dbSDimitry Andric /// The stride between the rows' data to be stored in memory. 1225ffd83dbSDimitry Andric #define _tile_stored(dst, base, stride) \ 1235ffd83dbSDimitry Andric __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) 1245ffd83dbSDimitry Andric 1255ffd83dbSDimitry Andric /// Zero the tile specified by "tdest". 1265ffd83dbSDimitry Andric /// 127fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 1285ffd83dbSDimitry Andric /// 1295ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 1305ffd83dbSDimitry Andric /// 1315ffd83dbSDimitry Andric /// \param tile 1325ffd83dbSDimitry Andric /// The destination tile to be zero. Max size is 1024 Bytes. 1335ffd83dbSDimitry Andric #define _tile_zero(tile) __builtin_ia32_tilezero((tile)) 1345ffd83dbSDimitry Andric 1355ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 1365ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 1375ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 1385ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 1395ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst". 1405ffd83dbSDimitry Andric /// 141fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 1425ffd83dbSDimitry Andric /// 1435ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 1445ffd83dbSDimitry Andric /// 1455ffd83dbSDimitry Andric /// \param dst 1465ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes. 1475ffd83dbSDimitry Andric /// \param src0 1485ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 1495ffd83dbSDimitry Andric /// \param src1 1505ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 151e8d8bef9SDimitry Andric #define _tile_dpbssd(dst, src0, src1) \ 152e8d8bef9SDimitry Andric __builtin_ia32_tdpbssd((dst), (src0), (src1)) 1535ffd83dbSDimitry Andric 1545ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 1555ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 1565ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 1575ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 1585ffd83dbSDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst". 1595ffd83dbSDimitry Andric /// 160fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 1615ffd83dbSDimitry Andric /// 1625ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 1635ffd83dbSDimitry Andric /// 1645ffd83dbSDimitry Andric /// \param dst 1655ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes. 1665ffd83dbSDimitry Andric /// \param src0 1675ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 1685ffd83dbSDimitry Andric /// \param src1 1695ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 170e8d8bef9SDimitry Andric #define _tile_dpbsud(dst, src0, src1) \ 171e8d8bef9SDimitry Andric __builtin_ia32_tdpbsud((dst), (src0), (src1)) 1725ffd83dbSDimitry Andric 1735ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 1745ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 1755ffd83dbSDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 1765ffd83dbSDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 1775ffd83dbSDimitry Andric /// and store the 32-bit result back to tile "dst". 1785ffd83dbSDimitry Andric /// 179fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 1805ffd83dbSDimitry Andric /// 1815ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 1825ffd83dbSDimitry Andric /// 1835ffd83dbSDimitry Andric /// \param dst 1845ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes. 1855ffd83dbSDimitry Andric /// \param src0 1865ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 1875ffd83dbSDimitry Andric /// \param src1 1885ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 189e8d8bef9SDimitry Andric #define _tile_dpbusd(dst, src0, src1) \ 190e8d8bef9SDimitry Andric __builtin_ia32_tdpbusd((dst), (src0), (src1)) 1915ffd83dbSDimitry Andric 1925ffd83dbSDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 1935ffd83dbSDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 1945ffd83dbSDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 1955ffd83dbSDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 1965ffd83dbSDimitry Andric /// "dst", and store the 32-bit result back to tile "dst". 1975ffd83dbSDimitry Andric /// 198fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 1995ffd83dbSDimitry Andric /// 2005ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 2015ffd83dbSDimitry Andric /// 2025ffd83dbSDimitry Andric /// \param dst 2035ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes. 2045ffd83dbSDimitry Andric /// \param src0 2055ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 2065ffd83dbSDimitry Andric /// \param src1 2075ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 208e8d8bef9SDimitry Andric #define _tile_dpbuud(dst, src0, src1) \ 209e8d8bef9SDimitry Andric __builtin_ia32_tdpbuud((dst), (src0), (src1)) 2105ffd83dbSDimitry Andric 2115ffd83dbSDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 2125ffd83dbSDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point 2135ffd83dbSDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile 2145ffd83dbSDimitry Andric /// "dst". 2155ffd83dbSDimitry Andric /// 216fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 2175ffd83dbSDimitry Andric /// 2185ffd83dbSDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 2195ffd83dbSDimitry Andric /// 2205ffd83dbSDimitry Andric /// \param dst 2215ffd83dbSDimitry Andric /// The destination tile. Max size is 1024 Bytes. 2225ffd83dbSDimitry Andric /// \param src0 2235ffd83dbSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 2245ffd83dbSDimitry Andric /// \param src1 2255ffd83dbSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 2265ffd83dbSDimitry Andric #define _tile_dpbf16ps(dst, src0, src1) \ 2275ffd83dbSDimitry Andric __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) 2285ffd83dbSDimitry Andric 229fe6060f1SDimitry Andric /// AMX tile register size can be configured, the maximum size is 16x64=1024 230fe6060f1SDimitry Andric /// bytes. Since there is no 2D type in llvm IR, we use vector type to 231fe6060f1SDimitry Andric /// represent 2D tile and the fixed size is maximum amx tile register size. 232e8d8bef9SDimitry Andric typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); 233fe6060f1SDimitry Andric 234fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 235e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 236e8d8bef9SDimitry Andric _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, 237e8d8bef9SDimitry Andric __SIZE_TYPE__ stride) { 238e8d8bef9SDimitry Andric return __builtin_ia32_tileloadd64_internal(m, n, base, 239e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride)); 240e8d8bef9SDimitry Andric } 241e8d8bef9SDimitry Andric 242fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 243fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 244fe6060f1SDimitry Andric _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, 245fe6060f1SDimitry Andric __SIZE_TYPE__ stride) { 246fe6060f1SDimitry Andric return __builtin_ia32_tileloaddt164_internal(m, n, base, 247fe6060f1SDimitry Andric (__SIZE_TYPE__)(stride)); 248fe6060f1SDimitry Andric } 249fe6060f1SDimitry Andric 250fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 251e8d8bef9SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 252e8d8bef9SDimitry Andric _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, 253e8d8bef9SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) { 254e8d8bef9SDimitry Andric return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); 255e8d8bef9SDimitry Andric } 256e8d8bef9SDimitry Andric 257fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 258fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 259fe6060f1SDimitry Andric _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k, 260fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) { 261fe6060f1SDimitry Andric return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); 262fe6060f1SDimitry Andric } 263fe6060f1SDimitry Andric 264fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 265fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 266fe6060f1SDimitry Andric _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k, 267fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) { 268fe6060f1SDimitry Andric return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); 269fe6060f1SDimitry Andric } 270fe6060f1SDimitry Andric 271fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 272fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 273fe6060f1SDimitry Andric _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, 274fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) { 275fe6060f1SDimitry Andric return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); 276fe6060f1SDimitry Andric } 277fe6060f1SDimitry Andric 278fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 279e8d8bef9SDimitry Andric static __inline__ void __DEFAULT_FN_ATTRS_INT8 280e8d8bef9SDimitry Andric _tile_stored_internal(unsigned short m, unsigned short n, void *base, 281e8d8bef9SDimitry Andric __SIZE_TYPE__ stride, _tile1024i tile) { 282e8d8bef9SDimitry Andric return __builtin_ia32_tilestored64_internal(m, n, base, 283e8d8bef9SDimitry Andric (__SIZE_TYPE__)(stride), tile); 284e8d8bef9SDimitry Andric } 285e8d8bef9SDimitry Andric 286fe6060f1SDimitry Andric /// This is internal intrinsic. C/C++ user should avoid calling it directly. 287fe6060f1SDimitry Andric static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16 288fe6060f1SDimitry Andric _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, 289fe6060f1SDimitry Andric _tile1024i dst, _tile1024i src1, _tile1024i src2) { 290fe6060f1SDimitry Andric return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); 291fe6060f1SDimitry Andric } 292fe6060f1SDimitry Andric 293fe6060f1SDimitry Andric /// This struct pack the shape and tile data together for user. We suggest 294fe6060f1SDimitry Andric /// initializing the struct as early as possible, because compiler depends 295fe6060f1SDimitry Andric /// on the shape information to do configure. The constant value is preferred 296fe6060f1SDimitry Andric /// for optimization by compiler. 297e8d8bef9SDimitry Andric typedef struct __tile1024i_str { 298e8d8bef9SDimitry Andric const unsigned short row; 299e8d8bef9SDimitry Andric const unsigned short col; 300e8d8bef9SDimitry Andric _tile1024i tile; 301e8d8bef9SDimitry Andric } __tile1024i; 302e8d8bef9SDimitry Andric 303fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into 304fe6060f1SDimitry Andric /// destination tile "dst". 305fe6060f1SDimitry Andric /// 306fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 307fe6060f1SDimitry Andric /// 308fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 309fe6060f1SDimitry Andric /// 310fe6060f1SDimitry Andric /// \param dst 311fe6060f1SDimitry Andric /// A destination tile. Max size is 1024 Bytes. 312fe6060f1SDimitry Andric /// \param base 313fe6060f1SDimitry Andric /// A pointer to base address. 314fe6060f1SDimitry Andric /// \param stride 315fe6060f1SDimitry Andric /// The stride between the rows' data to be loaded in memory. 316e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE 317*349cc55cSDimitry Andric static __inline__ void __tile_loadd(__tile1024i *dst, const void *base, 318e8d8bef9SDimitry Andric __SIZE_TYPE__ stride) { 319e8d8bef9SDimitry Andric dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); 320e8d8bef9SDimitry Andric } 321e8d8bef9SDimitry Andric 322fe6060f1SDimitry Andric /// Load tile rows from memory specifieid by "base" address and "stride" into 323fe6060f1SDimitry Andric /// destination tile "dst". This intrinsic provides a hint to the implementation 324fe6060f1SDimitry Andric /// that the data will likely not be reused in the near future and the data 325fe6060f1SDimitry Andric /// caching can be optimized accordingly. 326fe6060f1SDimitry Andric /// 327fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 328fe6060f1SDimitry Andric /// 329fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 330fe6060f1SDimitry Andric /// 331fe6060f1SDimitry Andric /// \param dst 332fe6060f1SDimitry Andric /// A destination tile. Max size is 1024 Bytes. 333fe6060f1SDimitry Andric /// \param base 334fe6060f1SDimitry Andric /// A pointer to base address. 335fe6060f1SDimitry Andric /// \param stride 336fe6060f1SDimitry Andric /// The stride between the rows' data to be loaded in memory. 337fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_TILE 338*349cc55cSDimitry Andric static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base, 339fe6060f1SDimitry Andric __SIZE_TYPE__ stride) { 340fe6060f1SDimitry Andric dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); 341e8d8bef9SDimitry Andric } 342e8d8bef9SDimitry Andric 343fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 344fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 345fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 346fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 347fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst". 348fe6060f1SDimitry Andric /// 349fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 350fe6060f1SDimitry Andric /// 351fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 352fe6060f1SDimitry Andric /// 353fe6060f1SDimitry Andric /// \param dst 354fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes. 355fe6060f1SDimitry Andric /// \param src0 356fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 357fe6060f1SDimitry Andric /// \param src1 358fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 359fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8 360*349cc55cSDimitry Andric static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, 361fe6060f1SDimitry Andric __tile1024i src1) { 362fe6060f1SDimitry Andric dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, 363fe6060f1SDimitry Andric src0.tile, src1.tile); 364fe6060f1SDimitry Andric } 365fe6060f1SDimitry Andric 366fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 367fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 368fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 369fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 370fe6060f1SDimitry Andric /// in "dst", and store the 32-bit result back to tile "dst". 371fe6060f1SDimitry Andric /// 372fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 373fe6060f1SDimitry Andric /// 374fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 375fe6060f1SDimitry Andric /// 376fe6060f1SDimitry Andric /// \param dst 377fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes. 378fe6060f1SDimitry Andric /// \param src0 379fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 380fe6060f1SDimitry Andric /// \param src1 381fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 382fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8 383*349cc55cSDimitry Andric static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, 384fe6060f1SDimitry Andric __tile1024i src1) { 385fe6060f1SDimitry Andric dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, 386fe6060f1SDimitry Andric src0.tile, src1.tile); 387fe6060f1SDimitry Andric } 388fe6060f1SDimitry Andric 389fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 390fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 391fe6060f1SDimitry Andric /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 392fe6060f1SDimitry Andric /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 393fe6060f1SDimitry Andric /// and store the 32-bit result back to tile "dst". 394fe6060f1SDimitry Andric /// 395fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 396fe6060f1SDimitry Andric /// 397fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 398fe6060f1SDimitry Andric /// 399fe6060f1SDimitry Andric /// \param dst 400fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes. 401fe6060f1SDimitry Andric /// \param src0 402fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 403fe6060f1SDimitry Andric /// \param src1 404fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 405fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8 406*349cc55cSDimitry Andric static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, 407fe6060f1SDimitry Andric __tile1024i src1) { 408fe6060f1SDimitry Andric dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, 409fe6060f1SDimitry Andric src0.tile, src1.tile); 410fe6060f1SDimitry Andric } 411fe6060f1SDimitry Andric 412fe6060f1SDimitry Andric /// Compute dot-product of bytes in tiles with a source/destination accumulator. 413fe6060f1SDimitry Andric /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 414fe6060f1SDimitry Andric /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 415fe6060f1SDimitry Andric /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 416fe6060f1SDimitry Andric /// "dst", and store the 32-bit result back to tile "dst". 417fe6060f1SDimitry Andric /// 418fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 419fe6060f1SDimitry Andric /// 420fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 421fe6060f1SDimitry Andric /// 422fe6060f1SDimitry Andric /// \param dst 423fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes. 424fe6060f1SDimitry Andric /// \param src0 425fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 426fe6060f1SDimitry Andric /// \param src1 427fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 428fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_INT8 429*349cc55cSDimitry Andric static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, 430fe6060f1SDimitry Andric __tile1024i src1) { 431fe6060f1SDimitry Andric dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, 432fe6060f1SDimitry Andric src0.tile, src1.tile); 433fe6060f1SDimitry Andric } 434fe6060f1SDimitry Andric 435fe6060f1SDimitry Andric /// Store the tile specified by "src" to memory specifieid by "base" address and 436fe6060f1SDimitry Andric /// "stride". 437fe6060f1SDimitry Andric /// 438fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 439fe6060f1SDimitry Andric /// 440fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 441fe6060f1SDimitry Andric /// 442fe6060f1SDimitry Andric /// \param dst 443fe6060f1SDimitry Andric /// A destination tile. Max size is 1024 Bytes. 444fe6060f1SDimitry Andric /// \param base 445fe6060f1SDimitry Andric /// A pointer to base address. 446fe6060f1SDimitry Andric /// \param stride 447fe6060f1SDimitry Andric /// The stride between the rows' data to be stored in memory. 448e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE 449*349cc55cSDimitry Andric static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride, 450*349cc55cSDimitry Andric __tile1024i src) { 451e8d8bef9SDimitry Andric _tile_stored_internal(src.row, src.col, base, stride, src.tile); 452e8d8bef9SDimitry Andric } 453e8d8bef9SDimitry Andric 454fe6060f1SDimitry Andric /// Zero the tile specified by "dst". 455fe6060f1SDimitry Andric /// 456fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 457fe6060f1SDimitry Andric /// 458fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 459fe6060f1SDimitry Andric /// 460fe6060f1SDimitry Andric /// \param dst 461fe6060f1SDimitry Andric /// The destination tile to be zero. Max size is 1024 Bytes. 462e8d8bef9SDimitry Andric __DEFAULT_FN_ATTRS_TILE 463*349cc55cSDimitry Andric static __inline__ void __tile_zero(__tile1024i *dst) { 464e8d8bef9SDimitry Andric dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); 465e8d8bef9SDimitry Andric } 4665ffd83dbSDimitry Andric 467fe6060f1SDimitry Andric /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 468fe6060f1SDimitry Andric /// src1, accumulating the intermediate single-precision (32-bit) floating-point 469fe6060f1SDimitry Andric /// elements with elements in "dst", and store the 32-bit result back to tile 470fe6060f1SDimitry Andric /// "dst". 471fe6060f1SDimitry Andric /// 472fe6060f1SDimitry Andric /// \headerfile <immintrin.h> 473fe6060f1SDimitry Andric /// 474fe6060f1SDimitry Andric /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 475fe6060f1SDimitry Andric /// 476fe6060f1SDimitry Andric /// \param dst 477fe6060f1SDimitry Andric /// The destination tile. Max size is 1024 Bytes. 478fe6060f1SDimitry Andric /// \param src0 479fe6060f1SDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 480fe6060f1SDimitry Andric /// \param src1 481fe6060f1SDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 482fe6060f1SDimitry Andric __DEFAULT_FN_ATTRS_BF16 483*349cc55cSDimitry Andric static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, 484fe6060f1SDimitry Andric __tile1024i src1) { 485fe6060f1SDimitry Andric dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, 486fe6060f1SDimitry Andric src0.tile, src1.tile); 487fe6060f1SDimitry Andric } 488fe6060f1SDimitry Andric 489fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_TILE 490fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_INT8 491fe6060f1SDimitry Andric #undef __DEFAULT_FN_ATTRS_BF16 492fe6060f1SDimitry Andric 4935ffd83dbSDimitry Andric #endif /* __x86_64__ */ 4945ffd83dbSDimitry Andric #endif /* __AMXINTRIN_H */ 495