1 /*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===------------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead." 12 #endif // __IMMINTRIN_H 13 14 #ifndef __AMX_COMPLEXINTRIN_H 15 #define __AMX_COMPLEXINTRIN_H 16 #ifdef __x86_64__ 17 18 #define __DEFAULT_FN_ATTRS_COMPLEX \ 19 __attribute__((__always_inline__, __nodebug__, __target__("amx-complex"))) 20 21 /// Perform matrix multiplication of two tiles containing complex elements and 22 /// accumulate the results into a packed single precision tile. Each dword 23 /// element in input tiles \a a and \a b is interpreted as a complex number 24 /// with FP16 real part and FP16 imaginary part. 25 /// Calculates the imaginary part of the result. For each possible combination 26 /// of (row of \a a, column of \a b), it performs a set of multiplication 27 /// and accumulations on all corresponding complex numbers (one from \a a 28 /// and one from \a b). The imaginary part of the \a a element is multiplied 29 /// with the real part of the corresponding \a b element, and the real part 30 /// of the \a a element is multiplied with the imaginary part of the 31 /// corresponding \a b elements. The two accumulated results are added, and 32 /// then accumulated into the corresponding row and column of \a dst. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmimfp16ps(dst, a, b)                                           \
  __builtin_ia32_tcmmimfp16ps((dst), (a), (b))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b element. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
78 /// 79 /// \headerfile <x86intrin.h> 80 /// 81 /// \code 82 /// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b); 83 /// \endcode 84 /// 85 /// \code{.operation} 86 /// FOR m := 0 TO dst.rows - 1 87 /// tmp := dst.row[m] 88 /// FOR k := 0 TO (a.colsb / 4) - 1 89 /// FOR n := 0 TO (dst.colsb / 4) - 1 90 /// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) 91 /// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) 92 /// ENDFOR 93 /// ENDFOR 94 /// write_row_and_zero(dst, m, tmp, dst.colsb) 95 /// ENDFOR 96 /// zero_upper_rows(dst, dst.rows) 97 /// zero_tileconfig_start() 98 /// \endcode 99 /// 100 /// This intrinsic corresponds to the \c TCMMIMFP16PS instruction. 101 /// 102 /// \param dst 103 /// The destination tile. Max size is 1024 Bytes. 104 /// \param a 105 /// The 1st source tile. Max size is 1024 Bytes. 106 /// \param b 107 /// The 2nd source tile. Max size is 1024 Bytes. 108 #define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b) 109 110 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX 111 _tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, 112 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 113 return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2); 114 } 115 116 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX 117 _tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, 118 _tile1024i dst, _tile1024i src1, _tile1024i src2) { 119 return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2); 120 } 121 122 /// Perform matrix multiplication of two tiles containing complex elements and 123 /// accumulate the results into a packed single precision tile. Each dword 124 /// element in input tiles src0 and src1 is interpreted as a complex number with 125 /// FP16 real part and FP16 imaginary part. 126 /// This function calculates the imaginary part of the result. 
127 /// 128 /// \headerfile <immintrin.h> 129 /// 130 /// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction. 131 /// 132 /// \param dst 133 /// The destination tile. Max size is 1024 Bytes. 134 /// \param src0 135 /// The 1st source tile. Max size is 1024 Bytes. 136 /// \param src1 137 /// The 2nd source tile. Max size is 1024 Bytes. 138 __DEFAULT_FN_ATTRS_COMPLEX 139 static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, 140 __tile1024i src1) { 141 dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col, 142 dst->tile, src0.tile, src1.tile); 143 } 144 145 /// Perform matrix multiplication of two tiles containing complex elements and 146 /// accumulate the results into a packed single precision tile. Each dword 147 /// element in input tiles src0 and src1 is interpreted as a complex number with 148 /// FP16 real part and FP16 imaginary part. 149 /// This function calculates the real part of the result. 150 /// 151 /// \headerfile <immintrin.h> 152 /// 153 /// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction. 154 /// 155 /// \param dst 156 /// The destination tile. Max size is 1024 Bytes. 157 /// \param src0 158 /// The 1st source tile. Max size is 1024 Bytes. 159 /// \param src1 160 /// The 2nd source tile. Max size is 1024 Bytes. 161 __DEFAULT_FN_ATTRS_COMPLEX 162 static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0, 163 __tile1024i src1) { 164 dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col, 165 dst->tile, src0.tile, src1.tile); 166 } 167 168 #endif // __x86_64__ 169 #endif // __AMX_COMPLEXINTRIN_H 170