/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXINTRIN_H
#define __AMXINTRIN_H
#ifdef __x86_64__

/* Attributes shared by every inline function in this header: always inline,
 * emit no debug info, and enable the "amx-tile" target feature for the body.
 * The macro is #undef'd at the end of the header so it does not leak out.
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))

/// Load tile configuration from a 64-byte memory location specified by
/// "__config". The tile configuration includes the tile type palette, the
/// number of bytes per row, and the number of rows. If the specified
/// palette_id is zero, that signifies the init state for both the tile
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configuration will result in a #GP fault.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit (64-byte) configuration to load.
static __inline__ void __DEFAULT_FN_ATTRS
_tile_loadconfig(const void *__config)
{
  __builtin_ia32_tile_loadconfig(__config);
}

/// Store the current tile configuration to a 64-byte memory location
/// specified by "__config". The tile configuration includes the tile type
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit (64-byte) buffer that receives the
///    configuration.
static __inline__ void __DEFAULT_FN_ATTRS
_tile_storeconfig(void *__config)
{
  __builtin_ia32_tile_storeconfig(__config);
}

/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS
_tile_release(void)
{
  __builtin_ia32_tilerelease();
}

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address. It is converted to <c> const void * </c>.
/// \param stride
///    The stride between the rows' data to be loaded in memory. It is
///    converted to <c> __SIZE_TYPE__ </c>.
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address. It is converted to <c> const void * </c>.
/// \param stride
///    The stride between the rows' data to be loaded in memory. It is
///    converted to <c> __SIZE_TYPE__ </c>.
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))

/// Store the tile specified by "dst" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
///    The tile whose contents are stored to memory (despite its name, this is
///    the source of the store). Max size is 1024 Bytes.
/// \param base
///    A pointer to base address. It is converted to <c> void * </c>.
/// \param stride
///    The stride between the rows' data to be stored in memory. It is
///    converted to <c> __SIZE_TYPE__ </c>.
#define _tile_stored(dst, base, stride)                                        \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))

/// Zero the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))

/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */