xref: /freebsd/contrib/llvm-project/clang/lib/Headers/amxfp16intrin.h (revision bdd1243df58e60e85101c09001d9812a789b6bc4)
1*bdd1243dSDimitry Andric /*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
2*bdd1243dSDimitry Andric  *
3*bdd1243dSDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*bdd1243dSDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
5*bdd1243dSDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*bdd1243dSDimitry Andric  *
7*bdd1243dSDimitry Andric  *===------------------------------------------------------------------------===
8*bdd1243dSDimitry Andric  */
9*bdd1243dSDimitry Andric 
/* Refuse direct inclusion: if __IMMINTRIN_H is not already defined when this
 * file is processed, compilation stops with the diagnostic below, which
 * directs the user to include <immintrin.h> instead. */
10*bdd1243dSDimitry Andric #ifndef __IMMINTRIN_H
11*bdd1243dSDimitry Andric #error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
12*bdd1243dSDimitry Andric #endif /* __IMMINTRIN_H */
13*bdd1243dSDimitry Andric 
14*bdd1243dSDimitry Andric #ifndef __AMX_FP16INTRIN_H
15*bdd1243dSDimitry Andric #define __AMX_FP16INTRIN_H
16*bdd1243dSDimitry Andric #ifdef __x86_64__
17*bdd1243dSDimitry Andric 
18*bdd1243dSDimitry Andric /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
19*bdd1243dSDimitry Andric ///    and \a b, accumulating the intermediate single-precision (32-bit)
20*bdd1243dSDimitry Andric ///    floating-point elements with elements in \a dst, and store the 32-bit
21*bdd1243dSDimitry Andric ///    result back to tile \a dst.
22*bdd1243dSDimitry Andric ///
23*bdd1243dSDimitry Andric /// \headerfile <immintrin.h>
24*bdd1243dSDimitry Andric ///
25*bdd1243dSDimitry Andric /// \code
26*bdd1243dSDimitry Andric /// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
27*bdd1243dSDimitry Andric /// \endcode
28*bdd1243dSDimitry Andric ///
29*bdd1243dSDimitry Andric /// \code{.operation}
30*bdd1243dSDimitry Andric /// FOR m := 0 TO dst.rows - 1
31*bdd1243dSDimitry Andric ///	tmp := dst.row[m]
32*bdd1243dSDimitry Andric ///	FOR k := 0 TO (a.colsb / 4) - 1
33*bdd1243dSDimitry Andric ///		FOR n := 0 TO (dst.colsb / 4) - 1
34*bdd1243dSDimitry Andric ///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
35*bdd1243dSDimitry Andric ///					FP32(b.row[k].fp16[2*n+0])
36*bdd1243dSDimitry Andric ///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
37*bdd1243dSDimitry Andric ///					FP32(b.row[k].fp16[2*n+1])
38*bdd1243dSDimitry Andric ///		ENDFOR
39*bdd1243dSDimitry Andric ///	ENDFOR
40*bdd1243dSDimitry Andric ///	write_row_and_zero(dst, m, tmp, dst.colsb)
41*bdd1243dSDimitry Andric /// ENDFOR
42*bdd1243dSDimitry Andric /// zero_upper_rows(dst, dst.rows)
43*bdd1243dSDimitry Andric /// zero_tileconfig_start()
44*bdd1243dSDimitry Andric /// \endcode
45*bdd1243dSDimitry Andric ///
46*bdd1243dSDimitry Andric /// This intrinsic corresponds to the \c TDPFP16PS instruction.
47*bdd1243dSDimitry Andric ///
48*bdd1243dSDimitry Andric /// \param dst
49*bdd1243dSDimitry Andric ///    The destination tile. Max size is 1024 Bytes.
50*bdd1243dSDimitry Andric /// \param a
51*bdd1243dSDimitry Andric ///    The 1st source tile. Max size is 1024 Bytes.
52*bdd1243dSDimitry Andric /// \param b
53*bdd1243dSDimitry Andric ///    The 2nd source tile. Max size is 1024 Bytes.
// Implementation note: _tile_dpfp16ps is a function-like macro, not an inline
// function. It forwards dst, a and b unchanged and in the same order to
// __builtin_ia32_tdpfp16ps, and is only defined when __x86_64__ is defined.
// NOTE(review): the prototype documented above gives the arguments as
// `__tile`; presumably they must be compile-time constant tile indices --
// confirm against the builtin's definition.
54*bdd1243dSDimitry Andric #define _tile_dpfp16ps(dst, a, b)                                \
55*bdd1243dSDimitry Andric   __builtin_ia32_tdpfp16ps(dst, a, b)
56*bdd1243dSDimitry Andric 
57*bdd1243dSDimitry Andric #endif /* __x86_64__ */
58*bdd1243dSDimitry Andric #endif /* __AMX_FP16INTRIN_H */
59