1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===------------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead." 12 #endif /* __IMMINTRIN_H */ 13 14 #ifndef __AMXINTRIN_H 15 #define __AMXINTRIN_H 16 #ifdef __x86_64__ 17 18 /* Define the default attributes for the functions in this file. */ 19 #define __DEFAULT_FN_ATTRS_TILE \ 20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) 21 #define __DEFAULT_FN_ATTRS_INT8 \ 22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) 23 #define __DEFAULT_FN_ATTRS_BF16 \ 24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16"))) 25 #define __DEFAULT_FN_ATTRS_FP16 \ 26 __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16"))) 27 28 /// Load tile configuration from a 64-byte memory location specified by 29 /// "mem_addr". The tile configuration includes the tile type palette, the 30 /// number of bytes per row, and the number of rows. If the specified 31 /// palette_id is zero, that signifies the init state for both the tile 32 /// config and the tile data, and the tiles are zeroed. Any invalid 33 /// configurations will result in #GP fault. 34 /// 35 /// \headerfile <immintrin.h> 36 /// 37 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction. 38 /// 39 /// \param __config 40 /// A pointer to 512-bits configuration 41 static __inline__ void __DEFAULT_FN_ATTRS_TILE 42 _tile_loadconfig(const void *__config) { 43 __builtin_ia32_tile_loadconfig(__config); 44 } 45 46 /// Stores the current tile configuration to a 64-byte memory location 47 /// specified by "mem_addr". 
The tile configuration includes the tile type 48 /// palette, the number of bytes per row, and the number of rows. If tiles 49 /// are not configured, all zeroes will be stored to memory. 50 /// 51 /// \headerfile <immintrin.h> 52 /// 53 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction. 54 /// 55 /// \param __config 56 /// A pointer to 512-bits configuration 57 static __inline__ void __DEFAULT_FN_ATTRS_TILE 58 _tile_storeconfig(void *__config) { 59 __builtin_ia32_tile_storeconfig(__config); 60 } 61 62 /// Release the tile configuration to return to the init state, which 63 /// releases all storage it currently holds. 64 /// 65 /// \headerfile <immintrin.h> 66 /// 67 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction. 68 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { 69 __builtin_ia32_tilerelease(); 70 } 71 72 /// Load tile rows from memory specifieid by "base" address and "stride" into 73 /// destination tile "dst" using the tile configuration previously configured 74 /// via "_tile_loadconfig". 75 /// 76 /// \headerfile <immintrin.h> 77 /// 78 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 79 /// 80 /// \param dst 81 /// A destination tile. Max size is 1024 Bytes. 82 /// \param base 83 /// A pointer to base address. 84 /// \param stride 85 /// The stride between the rows' data to be loaded in memory. 86 #define _tile_loadd(dst, base, stride) \ 87 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ 88 (__SIZE_TYPE__)(stride)) 89 90 /// Load tile rows from memory specifieid by "base" address and "stride" into 91 /// destination tile "dst" using the tile configuration previously configured 92 /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation 93 /// that the data will likely not be reused in the near future and the data 94 /// caching can be optimized accordingly. 
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))

/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param src
///    The source tile whose contents are stored. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(src, base, stride) \
  __builtin_ia32_tilestored64((src), ((void *)(base)), (__SIZE_TYPE__)(stride))

/// Zero the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))

/// Byte dot-product with a source/destination accumulator. Each group of 4
/// adjacent signed 8-bit integers in src0 is multiplied with the corresponding
/// unsigned 8-bit integers in src1, giving 4 intermediate 32-bit results.
/// Those 4 results are summed with the corresponding 32-bit integer in "dst",
/// and the 32-bit sum is stored back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))

/// Byte dot-product with a source/destination accumulator. Each group of 4
/// adjacent unsigned 8-bit integers in src0 is multiplied with the
/// corresponding signed 8-bit integers in src1, giving 4 intermediate 32-bit
/// results. Those 4 results are summed with the corresponding 32-bit integer
/// in "dst", and the 32-bit sum is stored back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))

/// Byte dot-product with a source/destination accumulator. Each group of 4
/// adjacent unsigned 8-bit integers in src0 is multiplied with the
/// corresponding unsigned 8-bit integers in src1, giving 4 intermediate 32-bit
/// results. Those 4 results are summed with the corresponding 32-bit integer
/// in "dst", and the 32-bit sum is stored back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))

/// Dot-product of BF16 (16-bit) floating-point pairs taken from tiles src0 and
/// src1. The intermediate single-precision (32-bit) floating-point elements
/// are accumulated with the elements in "dst", and the 32-bit result is stored
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))

/// The size of an AMX tile register is configurable, up to a maximum of
/// 16x64=1024 bytes. Since there is no 2D type in llvm IR, a vector type is
/// used to represent the 2D tile, fixed at the maximum amx tile register size.
234 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); 235 236 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 237 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 238 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, 239 __SIZE_TYPE__ stride) { 240 return __builtin_ia32_tileloadd64_internal(m, n, base, 241 (__SIZE_TYPE__)(stride)); 242 } 243 244 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 245 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 246 _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, 247 __SIZE_TYPE__ stride) { 248 return __builtin_ia32_tileloaddt164_internal(m, n, base, 249 (__SIZE_TYPE__)(stride)); 250 } 251 252 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 253 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 254 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, 255 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 256 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); 257 } 258 259 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 260 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 261 _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k, 262 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 263 return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); 264 } 265 266 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 267 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 268 _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k, 269 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 270 return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); 271 } 272 273 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 
274 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 275 _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, 276 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 277 return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); 278 } 279 280 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 281 static __inline__ void __DEFAULT_FN_ATTRS_INT8 282 _tile_stored_internal(unsigned short m, unsigned short n, void *base, 283 __SIZE_TYPE__ stride, _tile1024i tile) { 284 return __builtin_ia32_tilestored64_internal(m, n, base, 285 (__SIZE_TYPE__)(stride), tile); 286 } 287 288 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 289 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16 290 _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, 291 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 292 return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); 293 } 294 295 /// This is internal intrinsic. C/C++ user should avoid calling it directly. 296 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16 297 _tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, 298 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 299 return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2); 300 } 301 302 /// This struct pack the shape and tile data together for user. We suggest 303 /// initializing the struct as early as possible, because compiler depends 304 /// on the shape information to do configure. The constant value is preferred 305 /// for optimization by compiler. 306 typedef struct __tile1024i_str { 307 const unsigned short row; 308 const unsigned short col; 309 _tile1024i tile; 310 } __tile1024i; 311 312 /// Load tile rows from memory specifieid by "base" address and "stride" into 313 /// destination tile "dst". 
314 /// 315 /// \headerfile <immintrin.h> 316 /// 317 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction. 318 /// 319 /// \param dst 320 /// A destination tile. Max size is 1024 Bytes. 321 /// \param base 322 /// A pointer to base address. 323 /// \param stride 324 /// The stride between the rows' data to be loaded in memory. 325 __DEFAULT_FN_ATTRS_TILE 326 static __inline__ void __tile_loadd(__tile1024i *dst, const void *base, 327 __SIZE_TYPE__ stride) { 328 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); 329 } 330 331 /// Load tile rows from memory specifieid by "base" address and "stride" into 332 /// destination tile "dst". This intrinsic provides a hint to the implementation 333 /// that the data will likely not be reused in the near future and the data 334 /// caching can be optimized accordingly. 335 /// 336 /// \headerfile <immintrin.h> 337 /// 338 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction. 339 /// 340 /// \param dst 341 /// A destination tile. Max size is 1024 Bytes. 342 /// \param base 343 /// A pointer to base address. 344 /// \param stride 345 /// The stride between the rows' data to be loaded in memory. 346 __DEFAULT_FN_ATTRS_TILE 347 static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base, 348 __SIZE_TYPE__ stride) { 349 dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); 350 } 351 352 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 353 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 354 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 355 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 356 /// and store the 32-bit result back to tile "dst". 357 /// 358 /// \headerfile <immintrin.h> 359 /// 360 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction. 361 /// 362 /// \param dst 363 /// The destination tile. 
Max size is 1024 Bytes. 364 /// \param src0 365 /// The 1st source tile. Max size is 1024 Bytes. 366 /// \param src1 367 /// The 2nd source tile. Max size is 1024 Bytes. 368 __DEFAULT_FN_ATTRS_INT8 369 static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, 370 __tile1024i src1) { 371 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, 372 src0.tile, src1.tile); 373 } 374 375 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 376 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with 377 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 378 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 379 /// in "dst", and store the 32-bit result back to tile "dst". 380 /// 381 /// \headerfile <immintrin.h> 382 /// 383 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction. 384 /// 385 /// \param dst 386 /// The destination tile. Max size is 1024 Bytes. 387 /// \param src0 388 /// The 1st source tile. Max size is 1024 Bytes. 389 /// \param src1 390 /// The 2nd source tile. Max size is 1024 Bytes. 391 __DEFAULT_FN_ATTRS_INT8 392 static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, 393 __tile1024i src1) { 394 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, 395 src0.tile, src1.tile); 396 } 397 398 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 399 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 400 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit 401 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", 402 /// and store the 32-bit result back to tile "dst". 403 /// 404 /// \headerfile <immintrin.h> 405 /// 406 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction. 407 /// 408 /// \param dst 409 /// The destination tile. 
Max size is 1024 Bytes. 410 /// \param src0 411 /// The 1st source tile. Max size is 1024 Bytes. 412 /// \param src1 413 /// The 2nd source tile. Max size is 1024 Bytes. 414 __DEFAULT_FN_ATTRS_INT8 415 static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, 416 __tile1024i src1) { 417 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, 418 src0.tile, src1.tile); 419 } 420 421 /// Compute dot-product of bytes in tiles with a source/destination accumulator. 422 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with 423 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate 424 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in 425 /// "dst", and store the 32-bit result back to tile "dst". 426 /// 427 /// \headerfile <immintrin.h> 428 /// 429 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction. 430 /// 431 /// \param dst 432 /// The destination tile. Max size is 1024 Bytes. 433 /// \param src0 434 /// The 1st source tile. Max size is 1024 Bytes. 435 /// \param src1 436 /// The 2nd source tile. Max size is 1024 Bytes. 437 __DEFAULT_FN_ATTRS_INT8 438 static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, 439 __tile1024i src1) { 440 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, 441 src0.tile, src1.tile); 442 } 443 444 /// Store the tile specified by "src" to memory specifieid by "base" address and 445 /// "stride". 446 /// 447 /// \headerfile <immintrin.h> 448 /// 449 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction. 450 /// 451 /// \param base 452 /// A pointer to base address. 453 /// \param stride 454 /// The stride between the rows' data to be stored in memory. 
455 __DEFAULT_FN_ATTRS_TILE 456 static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride, 457 __tile1024i src) { 458 _tile_stored_internal(src.row, src.col, base, stride, src.tile); 459 } 460 461 /// Zero the tile specified by "dst". 462 /// 463 /// \headerfile <immintrin.h> 464 /// 465 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction. 466 /// 467 /// \param dst 468 /// The destination tile to be zero. Max size is 1024 Bytes. 469 __DEFAULT_FN_ATTRS_TILE 470 static __inline__ void __tile_zero(__tile1024i *dst) { 471 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); 472 } 473 474 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and 475 /// src1, accumulating the intermediate single-precision (32-bit) floating-point 476 /// elements with elements in "dst", and store the 32-bit result back to tile 477 /// "dst". 478 /// 479 /// \headerfile <immintrin.h> 480 /// 481 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction. 482 /// 483 /// \param dst 484 /// The destination tile. Max size is 1024 Bytes. 485 /// \param src0 486 /// The 1st source tile. Max size is 1024 Bytes. 487 /// \param src1 488 /// The 2nd source tile. Max size is 1024 Bytes. 489 __DEFAULT_FN_ATTRS_BF16 490 static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, 491 __tile1024i src1) { 492 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, 493 src0.tile, src1.tile); 494 } 495 496 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and 497 /// src1, accumulating the intermediate single-precision (32-bit) floating-point 498 /// elements with elements in "dst", and store the 32-bit result back to tile 499 /// "dst". 500 /// 501 /// \headerfile <immintrin.h> 502 /// 503 /// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction. 504 /// 505 /// \param dst 506 /// The destination tile. Max size is 1024 Bytes. 
507 /// \param src0 508 /// The 1st source tile. Max size is 1024 Bytes. 509 /// \param src1 510 /// The 2nd source tile. Max size is 1024 Bytes. 511 __DEFAULT_FN_ATTRS_FP16 512 static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0, 513 __tile1024i src1) { 514 dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, 515 src0.tile, src1.tile); 516 } 517 518 #undef __DEFAULT_FN_ATTRS_TILE 519 #undef __DEFAULT_FN_ATTRS_INT8 520 #undef __DEFAULT_FN_ATTRS_BF16 521 #undef __DEFAULT_FN_ATTRS_FP16 522 523 #endif /* __x86_64__ */ 524 #endif /* __AMXINTRIN_H */ 525