xref: /freebsd/contrib/llvm-project/clang/lib/Headers/amxtransposeintrin.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 /* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  * ===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
12 #endif /* __IMMINTRIN_H */
13 
14 #ifndef __AMX_TRANSPOSEINTRIN_H
15 #define __AMX_TRANSPOSEINTRIN_H
16 #ifdef __x86_64__
17 
/* Attribute set used by the functions defined below: always inlined, no
 * debug info, and built with the "amx-transpose" target feature. */
#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("amx-transpose")))
20 
/* Each macro below hands its three arguments, unchanged and in order, to the
 * identically named __builtin_ia32_t2rpntlvw* builtin. */
#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz0((tdst), (base), (stride))
#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz0t1((tdst), (base), (stride))
#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
  __builtin_ia32_t2rpntlvwz1((tdst), (base), (stride))
#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
  __builtin_ia32_t2rpntlvwz1t1((tdst), (base), (stride))
29 
/// Transposes the 32-bit elements of \a src and writes the result to \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_transposed(__tile dst, __tile src);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
///
/// FOR i := 0 TO (dst.rows-1)
///     tmp[511:0] := 0
///     FOR j := 0 TO (dst.colsb/4-1)
///         tmp.dword[j] := src.row[j].dword[i]
///     ENDFOR
///     dst.row[i] := tmp
/// ENDFOR
///
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_transposed(dst, src) __builtin_ia32_ttransposed((dst), (src))
59 
_tile_2rpntlvwz0_internal(unsigned short row,unsigned short col0,unsigned short col1,_tile1024i * dst0,_tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)60 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
61     unsigned short row, unsigned short col0, unsigned short col1,
62     _tile1024i *dst0, _tile1024i *dst1, const void *base,
63     __SIZE_TYPE__ stride) {
64   // Use __tile1024i_1024a* to escape the alignment check in
65   // clang/test/Headers/x86-intrinsics-headers-clean.cpp
66   __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
67                                       (_tile1024i_1024a *)dst1, base,
68                                       (__SIZE_TYPE__)(stride));
69 }
70 
_tile_2rpntlvwz0t1_internal(unsigned short row,unsigned short col0,unsigned short col1,_tile1024i * dst0,_tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)71 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
72     unsigned short row, unsigned short col0, unsigned short col1,
73     _tile1024i *dst0, _tile1024i *dst1, const void *base,
74     __SIZE_TYPE__ stride) {
75   __builtin_ia32_t2rpntlvwz0t1_internal(
76       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
77       (__SIZE_TYPE__)(stride));
78 }
79 
_tile_2rpntlvwz1_internal(unsigned short row,unsigned short col0,unsigned short col1,_tile1024i * dst0,_tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)80 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
81     unsigned short row, unsigned short col0, unsigned short col1,
82     _tile1024i *dst0, _tile1024i *dst1, const void *base,
83     __SIZE_TYPE__ stride) {
84   __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
85                                       (_tile1024i_1024a *)dst1, base,
86                                       (__SIZE_TYPE__)(stride));
87 }
88 
_tile_2rpntlvwz1t1_internal(unsigned short row,unsigned short col0,unsigned short col1,_tile1024i * dst0,_tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)89 static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
90     unsigned short row, unsigned short col0, unsigned short col1,
91     _tile1024i *dst0, _tile1024i *dst1, const void *base,
92     __SIZE_TYPE__ stride) {
93   __builtin_ia32_t2rpntlvwz1t1_internal(
94       row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
95       (__SIZE_TYPE__)(stride));
96 }
97 
98 // This is internal intrinsic. C/C++ user should avoid calling it directly.
99 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_transposed_internal(unsigned short m,unsigned short n,_tile1024i src)100 _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
101   return __builtin_ia32_ttransposed_internal(m, n, src);
102 }
103 
104 /// Converts a pair of tiles from memory into VNNI format, and places the
105 /// results in a pair of destinations specified by dst. The pair of tiles
106 /// in memory is specified via a tsib; the second tile is after the first
107 /// one, separated by the same stride that separates each row.
108 /// The tile configuration for the destination tiles indicates the amount
109 /// of data to read from memory. The instruction will load a number of rows
110 /// that is equal to twice the number of rows in tmm1. The size of each row
111 /// is equal to the average width of the destination tiles. If the second
112 /// tile is configured with zero rows and columns, only the first tile will
113 /// be written.
114 /// Provides a hint to the implementation that the data will likely not be
115 /// reused in the near future and the data caching can be optimized.
116 ///
117 /// \headerfile <immintrin.h>
118 ///
119 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
120 ///
121 /// \param dst0
122 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
123 /// \param dst1
124 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
125 /// \param base
126 ///    A pointer to base address.
127 /// \param stride
128 ///    The stride between the rows' data to be loaded in memory.
129 __DEFAULT_FN_ATTRS_TRANSPOSE
__tile_2rpntlvwz0(__tile1024i * dst0,__tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)130 static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
131                               const void *base, __SIZE_TYPE__ stride) {
132   _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
133                             &dst1->tile, base, stride);
134 }
135 
136 /// Converts a pair of tiles from memory into VNNI format, and places the
137 /// results in a pair of destinations specified by dst. The pair of tiles
138 /// in memory is specified via a tsib; the second tile is after the first
139 /// one, separated by the same stride that separates each row.
140 /// The tile configuration for the destination tiles indicates the amount
141 /// of data to read from memory. The instruction will load a number of rows
142 /// that is equal to twice the number of rows in tmm1. The size of each row
143 /// is equal to the average width of the destination tiles. If the second
144 /// tile is configured with zero rows and columns, only the first tile will
145 /// be written.
146 ///
147 /// \headerfile <immintrin.h>
148 ///
149 /// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
150 ///
151 /// \param dst0
152 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
153 /// \param dst1
154 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
155 /// \param base
156 ///    A pointer to base address.
157 /// \param stride
158 ///    The stride between the rows' data to be loaded in memory.
159 __DEFAULT_FN_ATTRS_TRANSPOSE
__tile_2rpntlvwz0t1(__tile1024i * dst0,__tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)160 static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
161                                 const void *base, __SIZE_TYPE__ stride) {
162   _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
163                               &dst1->tile, base, stride);
164 }
165 
166 /// Converts a pair of tiles from memory into VNNI format, and places the
167 /// results in a pair of destinations specified by dst. The pair of tiles
168 /// in memory is specified via a tsib; the second tile is after the first
169 /// one, separated by the same stride that separates each row.
170 /// The tile configuration for the destination tiles indicates the amount
171 /// of data to read from memory. The instruction will load a number of rows
172 /// that is equal to twice the number of rows in tmm1. The size of each row
173 /// is equal to the average width of the destination tiles. If the second
174 /// tile is configured with zero rows and columns, only the first tile will
175 /// be written. The last row will be not be read from memory but instead
176 /// filled with zeros.
177 /// Provides a hint to the implementation that the data will likely not be
178 /// reused in the near future and the data caching can be optimized.
179 ///
180 /// \headerfile <immintrin.h>
181 ///
182 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
183 ///
184 /// \param dst0
185 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
186 /// \param dst1
187 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
188 /// \param base
189 ///    A pointer to base address.
190 /// \param stride
191 ///    The stride between the rows' data to be loaded in memory.
192 __DEFAULT_FN_ATTRS_TRANSPOSE
__tile_2rpntlvwz1(__tile1024i * dst0,__tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)193 static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
194                               const void *base, __SIZE_TYPE__ stride) {
195   _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
196                             &dst1->tile, base, stride);
197 }
198 
199 /// Converts a pair of tiles from memory into VNNI format, and places the
200 /// results in a pair of destinations specified by dst. The pair of tiles
201 /// in memory is specified via a tsib; the second tile is after the first
202 /// one, separated by the same stride that separates each row.
203 /// The tile configuration for the destination tiles indicates the amount
204 /// of data to read from memory. The instruction will load a number of rows
205 /// that is equal to twice the number of rows in tmm1. The size of each row
206 /// is equal to the average width of the destination tiles. If the second
207 /// tile is configured with zero rows and columns, only the first tile will
208 /// be written. The last row will be not be read from memory but instead
209 /// filled with zeros.
210 /// Provides a hint to the implementation that the data will likely not be
211 /// reused in the near future and the data caching can be optimized.
212 ///
213 /// \headerfile <immintrin.h>
214 ///
215 /// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
216 ///
217 /// \param dst0
218 ///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
219 /// \param dst1
220 ///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
221 /// \param base
222 ///    A pointer to base address.
223 /// \param stride
224 ///    The stride between the rows' data to be loaded in memory.
225 __DEFAULT_FN_ATTRS_TRANSPOSE
__tile_2rpntlvwz1t1(__tile1024i * dst0,__tile1024i * dst1,const void * base,__SIZE_TYPE__ stride)226 static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
227                                 const void *base, __SIZE_TYPE__ stride) {
228   _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
229                               &dst1->tile, base, stride);
230 }
231 
232 /// Transpose 32-bit elements from src and write the result to dst.
233 ///
234 /// \headerfile <immintrin.h>
235 ///
236 /// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
237 ///
238 /// \param dst
239 ///    The destination tile. Max size is 1024 Bytes.
240 /// \param src
241 ///    The source tile. Max size is 1024 Bytes.
242 __DEFAULT_FN_ATTRS_TRANSPOSE
__tile_transposed(__tile1024i * dst,__tile1024i src)243 static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
244   dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
245 }
246 
247 #endif /* __x86_64__ */
248 #endif /* __AMX_TRANSPOSEINTRIN_H */
249