1*bdd1243dSDimitry Andric /*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------=== 2*bdd1243dSDimitry Andric * 3*bdd1243dSDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*bdd1243dSDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 5*bdd1243dSDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*bdd1243dSDimitry Andric * 7*bdd1243dSDimitry Andric *===------------------------------------------------------------------------=== 8*bdd1243dSDimitry Andric */ 9*bdd1243dSDimitry Andric 10*bdd1243dSDimitry Andric #ifndef __IMMINTRIN_H 11*bdd1243dSDimitry Andric #error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead." 12*bdd1243dSDimitry Andric #endif /* __IMMINTRIN_H */ 13*bdd1243dSDimitry Andric 14*bdd1243dSDimitry Andric #ifndef __AMX_FP16INTRIN_H 15*bdd1243dSDimitry Andric #define __AMX_FP16INTRIN_H 16*bdd1243dSDimitry Andric #ifdef __x86_64__ 17*bdd1243dSDimitry Andric 18*bdd1243dSDimitry Andric /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a 19*bdd1243dSDimitry Andric /// and \a b, accumulating the intermediate single-precision (32-bit) 20*bdd1243dSDimitry Andric /// floating-point elements with elements in \a dst, and store the 32-bit 21*bdd1243dSDimitry Andric /// result back to tile \a dst. 22*bdd1243dSDimitry Andric /// 23*bdd1243dSDimitry Andric /// \headerfile <immintrin.h> 24*bdd1243dSDimitry Andric /// 25*bdd1243dSDimitry Andric /// \code 26*bdd1243dSDimitry Andric /// void _tile_dpfp16ps (__tile dst, __tile a, __tile b) 27*bdd1243dSDimitry Andric /// \endcode 28*bdd1243dSDimitry Andric /// 29*bdd1243dSDimitry Andric /// \code{.operation} 30*bdd1243dSDimitry Andric /// FOR m := 0 TO dst.rows - 1 31*bdd1243dSDimitry Andric /// tmp := dst.row[m] 32*bdd1243dSDimitry Andric /// FOR k := 0 TO (a.colsb / 4) - 1 33*bdd1243dSDimitry Andric /// FOR n := 0 TO (dst.colsb / 4) - 1 34*bdd1243dSDimitry Andric /// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * 35*bdd1243dSDimitry Andric /// FP32(b.row[k].fp16[2*n+0]) 36*bdd1243dSDimitry Andric /// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * 37*bdd1243dSDimitry Andric /// FP32(b.row[k].fp16[2*n+1]) 38*bdd1243dSDimitry Andric /// ENDFOR 39*bdd1243dSDimitry Andric /// ENDFOR 40*bdd1243dSDimitry Andric /// write_row_and_zero(dst, m, tmp, dst.colsb) 41*bdd1243dSDimitry Andric /// ENDFOR 42*bdd1243dSDimitry Andric /// zero_upper_rows(dst, dst.rows) 43*bdd1243dSDimitry Andric /// zero_tileconfig_start() 44*bdd1243dSDimitry Andric /// \endcode 45*bdd1243dSDimitry Andric /// 46*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c TDPFP16PS instruction. 47*bdd1243dSDimitry Andric /// 48*bdd1243dSDimitry Andric /// \param dst 49*bdd1243dSDimitry Andric /// The destination tile. Max size is 1024 Bytes. 50*bdd1243dSDimitry Andric /// \param a 51*bdd1243dSDimitry Andric /// The 1st source tile. Max size is 1024 Bytes. 52*bdd1243dSDimitry Andric /// \param b 53*bdd1243dSDimitry Andric /// The 2nd source tile. Max size is 1024 Bytes. 54*bdd1243dSDimitry Andric #define _tile_dpfp16ps(dst, a, b) \ 55*bdd1243dSDimitry Andric __builtin_ia32_tdpfp16ps(dst, a, b) 56*bdd1243dSDimitry Andric 57*bdd1243dSDimitry Andric #endif /* __x86_64__ */ 58*bdd1243dSDimitry Andric #endif /* __AMX_FP16INTRIN_H */ 59