1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===------------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead." 12 #endif /* __IMMINTRIN_H */ 13 14 #ifndef __AMXINTRIN_H 15 #define __AMXINTRIN_H 16 #ifdef __x86_64__ 17 18 #define __DEFAULT_FN_ATTRS_TILE \ 19 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) 20 21 /// Load tile configuration from a 64-byte memory location specified by 22 /// "mem_addr". The tile configuration includes the tile type palette, the 23 /// number of bytes per row, and the number of rows. If the specified 24 /// palette_id is zero, that signifies the init state for both the tile 25 /// config and the tile data, and the tiles are zeroed. Any invalid 26 /// configurations will result in #GP fault. 27 /// 28 /// \headerfile <x86intrin.h> 29 /// 30 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction. 31 /// 32 /// \param __config 33 /// A pointer to 512-bits configuration 34 static __inline__ void __DEFAULT_FN_ATTRS_TILE 35 _tile_loadconfig(const void *__config) { 36 __builtin_ia32_tile_loadconfig(__config); 37 } 38 39 /// Stores the current tile configuration to a 64-byte memory location 40 /// specified by "mem_addr". The tile configuration includes the tile type 41 /// palette, the number of bytes per row, and the number of rows. If tiles 42 /// are not configured, all zeroes will be stored to memory. 43 /// 44 /// \headerfile <x86intrin.h> 45 /// 46 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction. 47 /// 48 /// \param __config 49 /// A pointer to 512-bits configuration 50 static __inline__ void __DEFAULT_FN_ATTRS_TILE 51 _tile_storeconfig(void *__config) { 52 __builtin_ia32_tile_storeconfig(__config); 53 } 54 55 /// Release the tile configuration to return to the init state, which 56 /// releases all storage it currently holds. 57 /// 58 /// \headerfile <x86intrin.h> 59 /// 60 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction. 61 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { 62 __builtin_ia32_tilerelease(); 63 } 64 65 /// Load tile rows from memory specifieid by "base" address and "stride" into 66 /// destination tile "dst" using the tile configuration previously configured 67 /// via "_tile_loadconfig". 68 /// 69 /// \headerfile <x86intrin.h> 70 /// 71 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 72 /// 73 /// \param dst 74 /// A destination tile. Max size is 1024 Bytes. 75 /// \param base 76 /// A pointer to base address. 77 /// \param stride 78 /// The stride between the rows' data to be loaded in memory. 79 #define _tile_loadd(dst, base, stride) \ 80 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ 81 (__SIZE_TYPE__)(stride)) 82 83 /// Load tile rows from memory specifieid by "base" address and "stride" into 84 /// destination tile "dst" using the tile configuration previously configured 85 /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation 86 /// that the data will likely not be reused in the near future and the data 87 /// caching can be optimized accordingly. 88 /// 89 /// \headerfile <x86intrin.h> 90 /// 91 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 92 /// 93 /// \param dst 94 /// A destination tile. Max size is 1024 Bytes. 95 /// \param base 96 /// A pointer to base address. 97 /// \param stride 98 /// The stride between the rows' data to be loaded in memory. 99 #define _tile_stream_loadd(dst, base, stride) \ 100 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ 101 (__SIZE_TYPE__)(stride)) 102 103 /// Store the tile specified by "src" to memory specifieid by "base" address and 104 /// "stride" using the tile configuration previously configured via 105 /// "_tile_loadconfig". 106 /// 107 /// \headerfile <x86intrin.h> 108 /// 109 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 110 /// 111 /// \param dst 112 /// A destination tile. Max size is 1024 Bytes. 113 /// \param base 114 /// A pointer to base address. 115 /// \param stride 116 /// The stride between the rows' data to be stored in memory. 117 #define _tile_stored(dst, base, stride) \ 118 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) 119 120 /// Zero the tile specified by "tdest". 121 /// 122 /// \headerfile <x86intrin.h> 123 /// 124 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 125 /// 126 /// \param tile 127 /// The destination tile to be zero. Max size is 1024 Bytes. 128 #define _tile_zero(tile) __builtin_ia32_tilezero((tile)) 129 130 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 131 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 132 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 133 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 134 /// and store the 32-bit result back to tile "dst". 135 /// 136 /// \headerfile <x86intrin.h> 137 /// 138 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 139 /// 140 /// \param dst 141 /// The destination tile. Max size is 1024 Bytes. 142 /// \param src0 143 /// The 1st source tile. Max size is 1024 Bytes. 144 /// \param src1 145 /// The 2nd source tile. Max size is 1024 Bytes. 146 #define _tile_dpbssd(dst, src0, src1) \ 147 __builtin_ia32_tdpbssd((dst), (src0), (src1)) 148 149 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 150 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 151 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 152 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 153 /// in "dst", and store the 32-bit result back to tile "dst". 154 /// 155 /// \headerfile <x86intrin.h> 156 /// 157 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 158 /// 159 /// \param dst 160 /// The destination tile. Max size is 1024 Bytes. 161 /// \param src0 162 /// The 1st source tile. Max size is 1024 Bytes. 163 /// \param src1 164 /// The 2nd source tile. Max size is 1024 Bytes. 165 #define _tile_dpbsud(dst, src0, src1) \ 166 __builtin_ia32_tdpbsud((dst), (src0), (src1)) 167 168 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 169 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 170 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 171 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 172 /// and store the 32-bit result back to tile "dst". 173 /// 174 /// \headerfile <x86intrin.h> 175 /// 176 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 177 /// 178 /// \param dst 179 /// The destination tile. Max size is 1024 Bytes. 180 /// \param src0 181 /// The 1st source tile. Max size is 1024 Bytes. 182 /// \param src1 183 /// The 2nd source tile. Max size is 1024 Bytes. 184 #define _tile_dpbusd(dst, src0, src1) \ 185 __builtin_ia32_tdpbusd((dst), (src0), (src1)) 186 187 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 188 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 189 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 190 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 191 /// "dst", and store the 32-bit result back to tile "dst". 192 /// 193 /// \headerfile <x86intrin.h> 194 /// 195 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 196 /// 197 /// \param dst 198 /// The destination tile. Max size is 1024 Bytes. 199 /// \param src0 200 /// The 1st source tile. Max size is 1024 Bytes. 201 /// \param src1 202 /// The 2nd source tile. Max size is 1024 Bytes. 203 #define _tile_dpbuud(dst, src0, src1) \ 204 __builtin_ia32_tdpbuud((dst), (src0), (src1)) 205 206 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 207 /// src1, accumulating the intermediate single-precision (32-bit) floating-point 208 /// elements with elements in "dst", and store the 32-bit result back to tile 209 /// "dst". 210 /// 211 /// \headerfile <x86intrin.h> 212 /// 213 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 214 /// 215 /// \param dst 216 /// The destination tile. Max size is 1024 Bytes. 217 /// \param src0 218 /// The 1st source tile. Max size is 1024 Bytes. 219 /// \param src1 220 /// The 2nd source tile. Max size is 1024 Bytes. 221 #define _tile_dpbf16ps(dst, src0, src1) \ 222 __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) 223 224 #define __DEFAULT_FN_ATTRS_INT8 \ 225 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) 226 227 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); 228 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 229 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, 230 __SIZE_TYPE__ stride) { 231 return __builtin_ia32_tileloadd64_internal(m, n, base, 232 (__SIZE_TYPE__)(stride)); 233 } 234 235 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 236 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, 237 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 238 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); 239 } 240 241 static __inline__ void __DEFAULT_FN_ATTRS_INT8 242 _tile_stored_internal(unsigned short m, unsigned short n, void *base, 243 __SIZE_TYPE__ stride, _tile1024i tile) { 244 return __builtin_ia32_tilestored64_internal(m, n, base, 245 (__SIZE_TYPE__)(stride), tile); 246 } 247 248 typedef struct __tile1024i_str { 249 const unsigned short row; 250 const unsigned short col; 251 _tile1024i tile; 252 } __tile1024i; 253 254 __DEFAULT_FN_ATTRS_TILE 255 static void __tile_loadd(__tile1024i *dst, const void *base, 256 __SIZE_TYPE__ stride) { 257 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); 258 } 259 260 __DEFAULT_FN_ATTRS_INT8 261 static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1, 262 __tile1024i src2) { 263 dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile, 264 src1.tile, src2.tile); 265 } 266 267 __DEFAULT_FN_ATTRS_TILE 268 static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { 269 _tile_stored_internal(src.row, src.col, base, stride, src.tile); 270 } 271 272 __DEFAULT_FN_ATTRS_TILE 273 static void __tile_zero(__tile1024i *dst) { 274 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); 275 } 276 277 #endif /* __x86_64__ */ 278 #endif /* __AMXINTRIN_H */ 279