xref: /freebsd/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsAMDGPU.td (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU intrinsics, covering both the "r600" and
// the "amdgcn" target prefixes.
//
//===----------------------------------------------------------------------===//
12
// Named shorthand for a pointer type qualified with address space 1.
def global_ptr_ty : LLVMQualPointerType<1>;
14
// Base class for intrinsics that take no operands and return an i32 read from
// a preloaded register.
//
// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
// by the backend cause whole-program undefined behavior when violated, such as
// by causing all other preload register intrinsics to return arbitrarily incorrect
// values. In non-entry-point functions, attempting to call a function that needs
// some preloaded register from a function that is known to not need it is a violation
// of the calling convention and also program-level UB. Outside of such IR-level UB,
// these preloaded registers are always set to a well-defined value and are thus `noundef`.
class AMDGPUReadPreloadRegisterIntrinsic
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
24
// Same signature and attributes as AMDGPUReadPreloadRegisterIntrinsic (no
// operands, noundef i32 result, no memory access, speculatable), additionally
// bound to the Clang builtin called `name`.
class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;
27
// Used to tag image and resource intrinsics with information used to generate
// mem operands.
//   rsrcarg: stored as RsrcArg (presumably the operand index of the resource
//            argument -- confirm against the users of this class).
//   isimage: stored as IsImage; defaults to false.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;
  bit IsImage = isimage;
}
34
35let TargetPrefix = "r600" in {
36
// Instantiates three preload-register intrinsics per defm, suffixed _x, _y and
// _z. None of them is bound to a Clang builtin.
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}
42
// Like AMDGPUReadPreloadRegisterIntrinsic_xyz, but each component is also bound
// to a Clang builtin whose name is `prefix` followed by the same _x/_y/_z
// suffix.
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}
48
// r600 preload-register reads that have a matching Clang builtin. Each defm
// expands to the _x, _y and _z variants.
defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

// r600 preload-register reads without a Clang builtin binding.
defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
58
// Group barrier with no operands and no result. It is declared with plain
// Intrinsic (not DefaultAttrsIntrinsic) and lists only the convergent and
// willreturn properties.
def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;
61
// Returns a noundef pointer in address space 7; takes no operands.
// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [],
  [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
67
// Typed RAT store: no result, and no extra properties beyond those that
// DefaultAttrsIntrinsic supplies.
def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;
74
// Overloaded on the floating-point type: the operand type matches the result
// type. No memory access, speculatable.
def int_r600_recipsqrt_ieee :  DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Same signature as int_r600_recipsqrt_ieee.
def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v4f32 -> v4f32; not overloaded.
def int_r600_cube : DefaultAttrsIntrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;
86
// Takes a v4f32 value followed by three i32 operands and returns nothing. The
// meaning of the i32 operands is not documented here.
def int_r600_store_stream_output : DefaultAttrsIntrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
>;
90
// Signature shared by the r600 texture intrinsics whose coordinate vector is
// floating point: a v4f32 coordinate plus nine i32 operands, returning v4f32.
// Marked as not accessing memory.
class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y
  llvm_i32_ty,   // offset_z
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;
104
// Integer counterpart of TextureIntrinsicFloatInput: a v4i32 coordinate plus
// nine i32 operands, returning v4i32. Marked as not accessing memory.
class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [
    llvm_v4i32_ty, // Coord
    llvm_i32_ty,   // offset_x
    llvm_i32_ty,   // offset_y
    llvm_i32_ty,   // offset_z
    llvm_i32_ty,   // resource_id
    llvm_i32_ty,   // samplerid
    llvm_i32_ty,   // coord_type_x
    llvm_i32_ty,   // coord_type_y
    llvm_i32_ty,   // coord_type_z
    llvm_i32_ty],  // coord_type_w
    [IntrNoMem]
>;
118
// Takes a v4f32 value and two i32 operands and returns nothing. Declared with
// plain Intrinsic, so the willreturn/nocallback/nofree properties are listed
// explicitly.
def int_r600_store_swizzle : Intrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
122
// r600 texture intrinsics. txf and txq take integer coordinates; all others
// take floating-point coordinates.
def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;
133
// Takes two v4f32 operands and returns a single float.
def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

// Takes one float operand and returns nothing. No extra properties are listed,
// so it is not marked IntrNoMem.
def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>;
139
140} // End TargetPrefix = "r600"
141
142let TargetPrefix = "amdgcn" in {
143
//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

// Each defm expands to _x, _y and _z variants. Only workgroup_id is bound to
// Clang builtins.
defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;
151
// Pointer-returning ABI intrinsics. All take no operands and return a noundef,
// 4-byte aligned pointer in address space 4. Only dispatch_ptr and queue_ptr
// are additionally marked nonnull.

// No Clang builtin is bound to this one here.
def int_amdgcn_dispatch_ptr :
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
170
// Returns the amount of LDS statically allocated for this program.
// This is no longer guaranteed to be a compile-time constant due to linking
// support.
def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

// No operands; returns a noundef i64.
def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

// For internal use. Coordinates LDS lowering between IR transform and backend.
// No Clang builtin is bound to it.
def int_amdgcn_lds_kernel_id :
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
185
// No operands; returns a noundef, 4-byte aligned pointer in address space 4.
// Not marked nonnull.
def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, NoUndef<RetIndex>,
    IntrNoMem, IntrSpeculatable]>;
191
// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
// The single operand must be an immediate (ImmArg on operand 0).
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic, computed values cannot be used.
// Only the bit offset (operand 1) must be an immediate.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
210
// No operands; returns a noundef i32.
def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

// Represent a relocation constant.
// The single operand is a metadata value; the result is an i32.
def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable]
>;
220
//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
// NOTE(review): the operand is declared as i32 below; "i16" presumably refers
// to the width of the encoded immediate -- confirm.
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;


// gfx11 intrinsic
// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
// The result type is overloaded (llvm_anyint_ty); no Clang builtin is bound
// here.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
239
// Barrier intrinsics. All are convergent, have side effects and are declared
// not to access memory. The *_var forms take the i32 operand as a regular
// value, whereas the non-_var signal forms require it to be an immediate.
def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

// The isfirst forms additionally return an i1.
def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;
258
// Further barrier intrinsics; all are convergent, have side effects and are
// declared not to access memory.
def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

// Unlike the other barrier intrinsics, the operand here is an i16 immediate.
def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// No operands; returns an i1.
def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
  Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Takes an i32 and returns an i32.
def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
284
// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
// The mask must be an immediate.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The first parameter is a mask that determines the types of instructions that
// you would like to synchronize around and add to a scheduling group. The
// values of the mask are defined above for sched_barrier. These instructions
// will be selected from the bottom up starting from the sched_group_barrier's
// location during instruction scheduling. The second parameter is the number of
// matching instructions that will be associated with this sched_group_barrier.
// The third parameter is an identifier which is used to describe what other
// sched_group_barriers should be synchronized with.
// All three parameters must be immediates.
def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
   IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Scheduler optimization hint.
//     MASK = 0: Small gemm opt
// The mask must be an immediate.
def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
321
// Takes a single i32 immediate and returns nothing. Has side effects but is
// declared not to access memory; not marked convergent.
def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// GFX12 intrinsics
// Shared signature for the per-counter wait intrinsics: one i16 immediate
// operand, no result, same properties as int_amdgcn_s_waitcnt. No Clang
// builtins are bound here.
class AMDGPUWaitIntrinsic :
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_s_wait_bvhcnt         : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_dscnt          : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_expcnt         : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_kmcnt          : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_loadcnt        : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_samplecnt      : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_storecnt       : AMDGPUWaitIntrinsic;
335
// Returns two values: a floating-point result of the overloaded type and an
// i1. The third operand must be an immediate.
def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
>;

// Three floating-point operands of the result type plus an i1. The i1 is not
// required to be an immediate.
def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Three floating-point operands of the result type.
def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;
355
// Look Up 2.0 / pi src0 with segment select src1[4:0]
// src0 has the overloaded floating-point result type; src1 is an i32.
def int_amdgcn_trig_preop : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// The remaining intrinsics in this group are unary and overloaded on the
// floating-point type: the operand type matches the result type.
def int_amdgcn_sin : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cos : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_log_{f16|f32}, performs log2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic log2 intrinsic should be
// preferred.
def int_amdgcn_log : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
// already). Performs exp2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic exp2 intrinsic should be
// preferred.
def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;
391
// Single-precision only (float, float) -> float; marked Commutative.
def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
// No Clang builtin is bound here.
def int_amdgcn_fma_legacy :
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;
406
// Unary floating-point intrinsics. The *_legacy forms are fixed to float and
// bound to Clang builtins; the others are overloaded on the floating-point
// type, with the operand type matching the result type.
def int_amdgcn_rcp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sqrt :  DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq :  DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq_legacy :  ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>;

// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;
432
// Operand type matches the overloaded floating-point result type.
def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Overloaded on both types independently: an integer result and a
// floating-point operand.
def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;
447
// Packing conversions: two scalar operands in, one two-element vector out.
// All are bound to Clang builtins, access no memory and are speculatable.

// (float, float) -> v2f16
def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// (float, float) -> v2i16
def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// (float, float) -> v2i16
def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// (i32, i32) -> v2i16
def int_amdgcn_cvt_pk_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// (i32, i32) -> v2i16
def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
476
// Takes an overloaded floating-point value and an i32, and returns an i1. The
// i32 is not required to be an immediate.
def int_amdgcn_class : DefaultAttrsIntrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Three operands, all of the overloaded floating-point result type.
def int_amdgcn_fmed3 :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable]
>;
487
// Cube intrinsics: each takes three floats and returns one float, is bound to
// a Clang builtin, accesses no memory and is speculatable.
def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;
511
// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
// Overloaded on the integer type; the operand type matches the result type.
def int_amdgcn_sffbh :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
// Three operands, all of the overloaded floating-point result type.
def int_amdgcn_fmad_ftz :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable]
>;
525
// Shared signature for the ds_ordered_* intrinsics: eight operands, i32 result.
// Operands 2 through 7 must be immediates; operand 0 (the pointer) is marked
// nocapture, and operand 1 (the value) is an ordinary i32.
// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree
  ]
>;
545
// Shared signature for ds_append / ds_consume: an overloaded pointer plus an
// immediate i1, returning i32. Convergent, and declared to access memory only
// through its pointer argument, which is marked nocapture. The empty string and
// the [SDNPMemOperand] list are the trailing arguments to Intrinsic.
class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty],    // isVolatile
  [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
   NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
  "",
  [SDNPMemOperand]
>;
555
// Ordered add/swap share the eight-operand AMDGPUDSOrderedIntrinsic signature.
def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
562
563} // TargetPrefix = "amdgcn"
564
// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////
570
// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
//   ty:   stored as Type.
//   name: stored as Name.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}
578
// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
//
// Only the first argument carries `basety`; every later argument refers back
// to it through LLVMMatchType<0>. `names[0]` is read unconditionally, so
// `names` is expected to be non-empty.
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}
585
// Return arglist, with LLVMMatchType's references shifted by 'shift'.
//
// Arguments whose Type is an LLVMMatchType are rebuilt with Number + shift and
// the same Name; all other arguments are passed through unchanged.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}
595
// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
//
// The outer fold appends each list in turn. Before a list is appended, its
// LLVMMatchType references are shifted by `shift` plus the sum of
// `Type.isAny` over the arguments accumulated so far (the inner fold).
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
                               !add(shift, !foldl(0, lhs, a, b,
                                                  !add(a, b.Type.isAny)))>.ret));
}
607
// Represent texture/image types / dimensionality.
//   enc:         stored as Encoding (3 bits).
//   name:        stored as Name.
//   asmsuffix:   stored as AsmSuffix.
//   coord_names: names of the coordinate arguments.
//   slice_names: names of the arguments appended after the coordinates.
//   msaa:        stored as MSAA; defaults to 0.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  // Self-reference: resolves to the record being defined.
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding
  bit MSAA = msaa;

  // Coordinates followed by slices, with a floating-point base type.
  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  // Same argument names, with an integer base type.
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  // One "d<coord>dh" per coordinate, then one "d<coord>dv" per coordinate.
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  // NumCoords counts coordinates and slices together.
  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}
631
// The eight dimension records, with encodings 0x0 through 0x7. DA is overridden
// to 1 for the cube and array dimensions; the two MSAA dimensions pass msaa = 1.
def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}
644
// Convenience lists of the dimension records: the six non-MSAA dimensions, the
// two MSAA dimensions, and both concatenated (non-MSAA first).
def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}
652
// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
//   ucmod:      stored as UpperCaseMod.
//   lcmod:      stored as LowerCaseMod.
//   extra_addr: stored as ExtraAddrArgs.
// The flag fields default to false and are set by `let` at the definition
// sites.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}
669
// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
//
// The helper multiclasses nest: Clamp instantiates Compare, which instantiates
// Offset, so each level doubles the number of variants produced.
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  // Emits the plain variant and an "_o" variant. The "_o" variant sets Offset
  // and prepends an i32 "offset" argument to the extra address arguments.
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  // Emits the Offset variants, then again with a "_c" prefix on the modifier.
  // The "_c" variants set ZCompare and append a float "zcompare" argument.
  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  // Emits the Compare variants, then again with a "_cl" suffix on the modifier
  // and LodOrClamp set to "clamp".
  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  // Variants defined in this inner defset are collected in both lists.
  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  // Gradient variants are collected only in the outer AMDGPUSampleVariants
  // list.
  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
711
712// Helper class to capture the profile of a dimension-aware image intrinsic.
713// This information is used to generate the intrinsic's type and to inform
714// codegen pattern matching.
715class AMDGPUDimProfile<string opmod,
716                       AMDGPUDimProps dim> {
717  AMDGPUDimProps Dim = dim;
718  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod
719
720  // These are intended to be overwritten by subclasses
721  bit IsSample = false;
722  bit IsAtomic = false;
723  list<LLVMType> RetTypes = [];
724  list<AMDGPUArg> DataArgs = [];
725  list<AMDGPUArg> ExtraAddrArgs = [];
726  bit Offset = false;
727  bit Bias = false;
728  bit ZCompare = false;
729  bit Gradients = false;
730  string LodClampMip = "";
731
  // Sum of the isAny field over all return types and data-argument types,
  // i.e. how many of them are overloaded.
732  int NumRetAndDataAnyTypes =
733    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
734           !add(a, b.isAny));
735
  // Address arguments in order: the extra address arguments, the gradient
  // arguments (only when Gradients is set), then the coordinates (float
  // variant for samples, integer variant otherwise) followed by one
  // LLVMMatchType<0> argument named LodClampMip when that name is non-empty.
  // NOTE(review): arglistconcat is defined outside this chunk; presumably it
  // shifts match-type indices by NumRetAndDataAnyTypes -- confirm there.
736  list<AMDGPUArg> AddrArgs =
737    arglistconcat<[ExtraAddrArgs,
738                   !if(Gradients, dim.GradientArgs, []),
739                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
740                               !if(!empty(LodClampMip),
741                                   []<AMDGPUArg>,
742                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
743                  NumRetAndDataAnyTypes>.ret;
  // Just the types of AddrArgs, in the same order.
744  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  // AddrArgs with every overloaded or LLVMMatchType-typed argument replaced
  // by a concrete type: float for sample profiles, i32 otherwise. Names are
  // kept.
745  list<AMDGPUArg> AddrDefaultArgs =
746    !foreach(arg, AddrArgs,
747             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
748                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
749                       arg.Name>);
  // Same substitution as AddrDefaultArgs but with 16-bit types: half for
  // sample profiles, i16 otherwise.
750  list<AMDGPUArg> AddrA16Args =
751    !foreach(arg, AddrArgs,
752             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
753                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
754                       arg.Name>);
755}
756
// Builds a new profile with the same OpMod and Dim as `base` and copies each
// of base's overridable fields. The derived lists (AddrArgs etc.) are not
// copied; they are recomputed by AMDGPUDimProfile from the copied fields.
757class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
758  let IsSample = base.IsSample;
759  let IsAtomic = base.IsAtomic;
760  let RetTypes = base.RetTypes;
761  let DataArgs = base.DataArgs;
762  let ExtraAddrArgs = base.ExtraAddrArgs;
763  let Offset = base.Offset;
764  let Bias = base.Bias;
765  let ZCompare = base.ZCompare;
766  let Gradients = base.Gradients;
767  let LodClampMip = base.LodClampMip;
768}
769
// Profile for sampler-based image intrinsics. Sets IsSample and takes the
// extra address arguments, presence flags and lod/clamp name from `sample`.
// With has_return (the default) there is one overloaded return type;
// otherwise the return type list is empty.
770class AMDGPUDimSampleProfile<string opmod,
771                             AMDGPUDimProps dim,
772                             AMDGPUSampleVariant sample,
773                             bit has_return = true> : AMDGPUDimProfile<opmod, dim> {
774  let IsSample = true;
775  let RetTypes = !if(has_return, [llvm_any_ty], []);
776  let ExtraAddrArgs = sample.ExtraAddrArgs;
777  let Offset = sample.Offset;
778  let Bias = sample.Bias;
779  let ZCompare = sample.ZCompare;
780  let Gradients = sample.Gradients;
781  let LodClampMip = sample.LodOrClamp;
782}
783
// AMDGPUDimSampleProfile with has_return fixed to false, i.e. a sample
// profile whose RetTypes list is empty.
784class AMDGPUDimSampleNoReturnProfile<string opmod,
785                             AMDGPUDimProps dim,
786                             AMDGPUSampleVariant sample>
787    : AMDGPUDimSampleProfile<opmod, dim, sample, false> {
788}
789
// Profile for image intrinsics that do not use a sampler. The caller supplies
// the return types and data arguments directly; when Mip is set, a trailing
// address argument named "mip" is requested via LodClampMip.
790class AMDGPUDimNoSampleProfile<string opmod,
791                               AMDGPUDimProps dim,
792                               list<LLVMType> retty,
793                               list<AMDGPUArg> dataargs,
794                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
795  let RetTypes = retty;
796  let DataArgs = dataargs;
797  let LodClampMip = !if(Mip, "mip", "");
798}
799
// Profile for image atomics: one overloaded integer return type, the given
// data arguments, and IsAtomic set.
800class AMDGPUDimAtomicProfile<string opmod,
801                             AMDGPUDimProps dim,
802                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
803  let RetTypes = [llvm_anyint_ty];
804  let DataArgs = dataargs;
805  let IsAtomic = true;
806}
807
// Same as AMDGPUDimAtomicProfile except that the overloaded return type is
// floating-point instead of integer.
808class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
809                                  list<AMDGPUArg> dataargs>
810    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
811  let RetTypes = [llvm_anyfloat_ty];
812}
813
// Profile for the GET_RESINFO intrinsics. Unlike the other profiles it
// overrides AddrArgs directly: the only address argument is an overloaded
// integer named "mip", regardless of the dimension's coordinate list.
814class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
815    : AMDGPUDimProfile<"GET_RESINFO", dim> {
816  let RetTypes = [llvm_anyfloat_ty];
817  let DataArgs = [];
818  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
819  let LodClampMip = "mip";
820}
821
822// Helper class for figuring out image intrinsic argument indexes.
// The Num* fields count how many arguments of each kind the profile
// contributes; the *ArgIndex fields are running sums of those counts in the
// order the arguments appear in the intrinsic's parameter list.
823class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
824  int NumDataArgs = !size(P_.DataArgs);
825  int NumDmaskArgs = !not(P_.IsAtomic); // atomics take no dmask argument
826  int NumOffsetArgs = !if(P_.Offset, 1, 0);
827  int NumBiasArgs = !if(P_.Bias, 1, 0);
828  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
829  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
830  int NumVAddrArgs = !size(P_.AddrArgs);
831  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
832  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
833  int NumRSrcArgs = 1;
834  int NumSampArgs = !if(P_.IsSample, 2, 0); // samp + unorm
835  int DmaskArgIndex = NumDataArgs;
836  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
837  int OffsetArgIndex = VAddrArgIndex;
838  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
839  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
840  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
841  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
842  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1); // last address argument
843  int MipArgIndex = LodArgIndex;
844  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
845  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
846  int UnormArgIndex = !add(SampArgIndex, 1);
  // For non-sample profiles NumSampArgs is 0, so texfailctrl directly
  // follows rsrc.
847  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
848  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
849}
850
851// All dimension-aware intrinsics are derived from this class.
// The parameter list is assembled from the profile P_: data arguments, an
// optional dmask, the address arguments, rsrc, optional samp/unorm, then
// texfailctrl and cachepolicy. `props` and `sdnodeprops` are supplied by the
// caller; the attributes listed below are appended to `props`.
852class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
853                              list<IntrinsicProperty> props,
854                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
855    P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
856    !listconcat(
857      !foreach(arg, P_.DataArgs, arg.Type),    // vdata(VGPR) -- for store/atomic
858      !if(P_.IsAtomic, [], [llvm_i32_ty]),     // dmask(imm)
859      P_.AddrTypes,                            // vaddr(VGPR)
860      [llvm_v8i32_ty],                         // rsrc(SGPR)
861      !if(P_.IsSample, [llvm_v4i32_ty,         // samp(SGPR)
862                        llvm_i1_ty], []),      // unorm(imm)
863      [llvm_i32_ty,                            // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
864       llvm_i32_ty]),                          // auxiliary/cachepolicy(imm):
865                                               //                bit 0 = glc, bit 1 = slc,
866                                               //                bit 2 = dlc (gfx10/gfx11),
867                                               //                bit 4 = scc (gfx90a)
868                                               //        gfx940: bit 0 = sc0, bit 1 = nt, bit 4 = sc1
869                                               //        gfx12+: bits [0-2] = th, bits [3-4] = scope
     // Every intrinsic gets NoCallback/NoFree/WillReturn; dmask (non-atomic
     // only), unorm (sample only), texfailctrl and cachepolicy are marked as
     // immediates; non-atomic intrinsics additionally get IntrNoSync.
870     !listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn],
871          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
872          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
873          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
874           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
875          !if(P_.IsAtomic, [], [IntrNoSync])),
876
877
878      "", sdnodeprops>,
  // rsrc position = number of data args + number of address args + 1 for the
  // dmask when present; the second argument tags the record as an image
  // intrinsic.
879  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
880                           !if(P_.IsAtomic, 0, 1)), 1> {
881  AMDGPUDimProfile P = P_;
882
  // Reference to this record itself, looked up by its final name.
883  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);
884
885  let TargetPrefix = "amdgcn";
886}
887
888// Marker class for intrinsics with a DMask that determines the returned
889// channels.
// It declares no fields; records only inherit from it as a tag.
890class AMDGPUImageDMaskIntrinsic;
891
892defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
893
894  //////////////////////////////////////////////////////////////////////////
895  // Load and store intrinsics
896  //////////////////////////////////////////////////////////////////////////
  // Defines one non-sampler intrinsic named NAME_<dim> for every dimension
  // in AMDGPUDims.NoMsaa. `Mip` is forwarded to AMDGPUDimNoSampleProfile.
897  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
898                                            list<LLVMType> retty,
899                                            list<AMDGPUArg> dataargs,
900                                            list<IntrinsicProperty> props,
901                                            list<SDNodeProperty> sdnodeprops,
902                                            bit Mip = false> {
903    foreach dim = AMDGPUDims.NoMsaa in {
904      def !strconcat(NAME, "_", dim.Name)
905        : AMDGPUImageDimIntrinsic<
906            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
907            props, sdnodeprops>;
908    }
909  }
910
  // Same as AMDGPUImageDimIntrinsicsNoMsaa but iterates AMDGPUDims.All, so
  // the MSAA dimensions get an intrinsic as well.
911  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
912                                         list<LLVMType> retty,
913                                         list<AMDGPUArg> dataargs,
914                                         list<IntrinsicProperty> props,
915                                         list<SDNodeProperty> sdnodeprops,
916                                         bit Mip = false> {
917    foreach dim = AMDGPUDims.All in {
918      def !strconcat(NAME, "_", dim.Name)
919        : AMDGPUImageDimIntrinsic<
920            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
921            props, sdnodeprops>;
922    }
923  }
924
  // Plain load/store cover all dimensions; the *_mip forms cover the
  // non-MSAA dimensions and pass Mip = 1 to add the trailing "mip" address
  // argument. Loads return one overloaded type; stores take one overloaded
  // floating-point "vdata" argument and return nothing.
  // NOTE(review): IntrWillReturn in the props below repeats the attribute
  // that AMDGPUImageDimIntrinsic appends unconditionally.
925  defm int_amdgcn_image_load
926    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
927                                  [SDNPMemOperand]>,
928      AMDGPUImageDMaskIntrinsic;
929  defm int_amdgcn_image_load_mip
930    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
931                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
932      AMDGPUImageDMaskIntrinsic;
933
934  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
935              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
936              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
937              AMDGPUImageDMaskIntrinsic;
938  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
939              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
940              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
941              AMDGPUImageDMaskIntrinsic;
942
943  //////////////////////////////////////////////////////////////////////////
944  // MSAA intrinsics
945  //////////////////////////////////////////////////////////////////////////
  // One msaa_load_x and one msaa_load intrinsic per MSAA dimension. Both are
  // read-only, non-sampler loads with a single overloaded return type; these
  // records do not inherit AMDGPUImageDMaskIntrinsic.
946  foreach dim = AMDGPUDims.Msaa in {
947    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
948        AMDGPUImageDimIntrinsic<
949            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
950            [IntrReadMem], [SDNPMemOperand]>;
951  }
952
953  foreach dim = AMDGPUDims.Msaa in {
954    def int_amdgcn_image_msaa_load # _ # dim.Name:
955        AMDGPUImageDimIntrinsic<
956            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
957            [IntrReadMem], [SDNPMemOperand]>;
958  }
959
960  //////////////////////////////////////////////////////////////////////////
961  // sample and getlod intrinsics
962  //////////////////////////////////////////////////////////////////////////
  // Defines one sampler intrinsic named NAME_<dim> per non-MSAA dimension
  // for the given sample variant. With NoMem set the intrinsic is IntrNoMem
  // and has no SDNode properties; otherwise it is IntrReadMem with
  // SDNPMemOperand.
963  multiclass AMDGPUImageDimSampleDims<string opmod,
964                                      AMDGPUSampleVariant sample,
965                                      bit NoMem = false> {
966    foreach dim = AMDGPUDims.NoMsaa in {
967      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
968          AMDGPUDimSampleProfile<opmod, dim, sample>,
969          !if(NoMem, [IntrNoMem], [IntrReadMem]),
970          !if(NoMem, [], [SDNPMemOperand])>;
971    }
972  }
973
  // Instantiate the sample intrinsics for every variant: the record name
  // gets the variant's lower-case suffix and the opmod its upper-case one.
974  foreach sample = AMDGPUSampleVariants in {
975    defm int_amdgcn_image_sample # sample.LowerCaseMod
976      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
977        AMDGPUImageDMaskIntrinsic;
978  }
979
  // Like AMDGPUImageDimSampleDims, but uses the no-return profile and
  // appends "_nortn" to each record name. No memory-access property is
  // passed in props here.
  // NOTE(review): IntrWillReturn repeats the attribute that
  // AMDGPUImageDimIntrinsic appends unconditionally.
980  multiclass AMDGPUImageDimSampleNoReturnDims<string opmod,
981                                      AMDGPUSampleVariant sample> {
982    foreach dim = AMDGPUDims.NoMsaa in {
983      def !strconcat(NAME, "_", dim.Name, "_nortn") : AMDGPUImageDimIntrinsic<
984          AMDGPUDimSampleNoReturnProfile<opmod, dim, sample>,
985          [IntrWillReturn], [SDNPMemOperand]>;
986    }
987  }
  // Instantiate the no-return sample intrinsics for every variant; the opmod
  // is "SAMPLE" + the variant's upper-case suffix + "_nortn".
988  foreach sample = AMDGPUSampleVariants in {
989    defm int_amdgcn_image_sample # sample.LowerCaseMod
990      : AMDGPUImageDimSampleNoReturnDims<
991        "SAMPLE" # sample.UpperCaseMod # "_nortn", sample>,
992        AMDGPUImageDMaskIntrinsic;
993  }
994
  // getlod uses the base AMDGPUSample variant (no modifiers) and passes
  // NoMem = 1, so the resulting intrinsics are IntrNoMem.
995  defm int_amdgcn_image_getlod
996    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
997      AMDGPUImageDMaskIntrinsic;
998
999  //////////////////////////////////////////////////////////////////////////
1000  // getresinfo intrinsics
1001  //////////////////////////////////////////////////////////////////////////
  // One getresinfo intrinsic per dimension (MSAA included), IntrNoMem and
  // without SDNode properties.
1002  foreach dim = AMDGPUDims.All in {
1003    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
1004      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
1005        AMDGPUImageDMaskIntrinsic;
1006  }
1007
1008  //////////////////////////////////////////////////////////////////////////
1009  // gather4 intrinsics
1010  //////////////////////////////////////////////////////////////////////////
  // gather4 is defined for the non-gradient sample variants only, and only
  // for the 2D, cube and 2D-array dimensions. These records do not inherit
  // AMDGPUImageDMaskIntrinsic.
1011  foreach sample = AMDGPUSampleVariantsNoGradients in {
1012    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
1013      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
1014          AMDGPUImageDimIntrinsic<
1015              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
1016              [IntrReadMem], [SDNPMemOperand]>;
1017    }
1018  }
1019}
1020
1021//////////////////////////////////////////////////////////////////////////
1022// atomic intrinsics
1023//////////////////////////////////////////////////////////////////////////
// Collects every image atomic intrinsic record defined inside the braces.
1024defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  // Defines NAME_<dim> for every dimension with the given data arguments.
  // isFloat selects the float atomic profile instead of the integer one.
  // No extra intrinsic properties are passed.
1025  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
1026                                   int isFloat = 0> {
1027        foreach dim = AMDGPUDims.All in {
1028          def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
1029              !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
1030                   AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
1031              [], [SDNPMemOperand]>;
1032        }
1033  }
1034
  // Common case: a single "vdata" operand typed to match overloaded type 0.
1035  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
1036    defm ""
1037        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
1038                                isFloat>;
1039  }
1040
  // Shorthand for AMDGPUImageDimAtomic with isFloat = 1.
1041  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
1042    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
1043  }
1044
1045  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
1046  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
1047  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
1048  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
1049  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
1050  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
1051  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
1052  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
1053  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
1054  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
1055  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
1056  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
1057  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
1058  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;
1059  defm int_amdgcn_image_atomic_add_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_ADD_FLT">;
1060  defm int_amdgcn_image_atomic_min_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MIN_FLT">;
1061  defm int_amdgcn_image_atomic_max_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MAX_FLT">;
1062
  // cmpswap takes two data operands ("src" and "cmp"), both typed to match
  // overloaded type 0, and uses the integer atomic profile.
1063  defm int_amdgcn_image_atomic_cmpswap :
1064      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
1065                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
1066
1067  defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">;
1068  defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">;
1069}
1070
1071//////////////////////////////////////////////////////////////////////////
1072// Buffer intrinsics
1073//////////////////////////////////////////////////////////////////////////
1074
1075// Data type for buffer resources (V#). Maybe, in the future, we can create a
1076// similar one for textures (T#).
// Defined as a pointer qualified with address space 8.
1077def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>;
1078
1079let TargetPrefix = "amdgcn" in {
1080
// Produces an AMDGPUBufferRsrcTy value from an overloaded base pointer plus
// stride, NumRecords/extent and flags operands. Declared IntrNoMem, with the
// base pointer marked ReadNone.
1081def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
1082  [AMDGPUBufferRsrcTy],
1083  [llvm_anyptr_ty, // base
1084   llvm_i16_ty,    // stride (and swizzle control)
1085   llvm_i32_ty,    // NumRecords / extent
1086   llvm_i32_ty],   // flags
1087  // Attributes lifted from ptrmask + some extra argument attributes.
1088  [IntrNoMem, ReadNone<ArgIndex<0>>,
1089   IntrSpeculatable, IntrWillReturn]>;
1090
1091defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
1092
1093// Generate a buffer_load instruction that may be optimized to s_buffer_load if
1094// the offset argument is uniform.
// Overloaded return type; declared IntrNoMem with the cachepolicy operand
// (argument 2) required to be an immediate. rsrc is argument 0.
1095def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
1096  [llvm_any_ty],
1097  [llvm_v4i32_ty,    // rsrc(SGPR)
1098   llvm_i32_ty,      // byte offset
1099   llvm_i32_ty],     // auxiliary/cachepolicy(imm):
1100                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1101                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1102                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1103                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1104                     //                bit 6 = swz
1105                     // Note: volatile bit is **not** permitted here.
1106  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
1107  AMDGPURsrcIntrinsic<0>;
1108
1109// Buffer intrinsics with separate raw and struct variants.  The raw
1110// variant never has an index. The struct variant always has an index, even if
1111// it is const 0. A struct intrinsic with constant 0 index is different to the
1112// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
1113// and swizzling changes depending on whether idxen is set in the instruction.
1114// These intrinsics also keep the offset and soffset arguments separate as
1115// they behave differently in bounds checking and swizzling.
1116
1117// The versions of these intrinsics that take <4 x i32> arguments are deprecated
1118// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments,
1119// which allow for improved reasoning about memory accesses.
1120//
1121// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved
1122// through to final assembly selection and is used to signal that the buffer
1123// operation is volatile.
// Raw (no vindex) buffer load taking the resource as <4 x i32>. Read-only;
// the cachepolicy operand (argument 3) must be an immediate; rsrc is
// argument 0. data_ty selects the overloaded return type.
1124class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1125  [data_ty],
1126  [llvm_v4i32_ty,    // rsrc(SGPR)
1127   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
1128   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1129   llvm_i32_ty],     // auxiliary/cachepolicy(imm):
1130                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1131                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1132                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1133                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1134                     //                bit 6 = swz
1135                     //           all: volatile op (bit 31, stripped at lowering)
1136  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
1137  AMDGPURsrcIntrinsic<0>;
// The format variant restricts the result to floating-point types; the plain
// variant accepts any type.
1138def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
1139def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
1140
// Atomic flavour of the raw buffer load. It derives from Intrinsic rather
// than DefaultAttrsIntrinsic, lists its attributes explicitly and carries no
// IntrReadMem attribute.
1141class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
1142  [data_ty],
1143  [llvm_v4i32_ty,     // rsrc(SGPR)
1144   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1145   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1146   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
1147                      //                                       bit 1 = slc,
1148                      //                                       bit 2 = dlc on gfx10+),
1149                      //                      swizzled buffer (bit 3 = swz))
1150  [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1151  AMDGPURsrcIntrinsic<0>;
1152def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad;
1153
// Raw buffer load taking the resource as an AMDGPUBufferRsrcTy pointer.
// Besides IntrReadMem and the immediate cachepolicy (argument 3), the
// pointer operand is marked ReadOnly/NoCapture and the intrinsic is
// IntrArgMemOnly.
1154class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1155  [data_ty],
1156  [AMDGPUBufferRsrcTy,    // rsrc(SGPR)
1157   llvm_i32_ty,           // offset(VGPR/imm, included in bounds checking and swizzling)
1158   llvm_i32_ty,           // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1159   llvm_i32_ty],          // auxiliary/cachepolicy(imm):
1160                          //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1161                          //                bit 3 = swz, bit 4 = scc (gfx90a)
1162                          //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1163                          //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1164                          //                bit 6 = swz
1165                          //           all: volatile op (bit 31, stripped at lowering)
1166  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
1167  ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
1168  AMDGPURsrcIntrinsic<0>;
// The format variant restricts the result to floating-point types.
1169def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
1170def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;
1171
// Atomic raw buffer load taking the resource as an AMDGPUBufferRsrcTy
// pointer. Derives from Intrinsic with explicit attributes: IntrArgMemOnly
// and NoCapture on the pointer, but neither IntrReadMem nor ReadOnly.
1172class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
1173  [data_ty],
1174  [AMDGPUBufferRsrcTy,// rsrc(SGPR)
1175   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1176   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1177   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
1178                      //                                       bit 1 = slc,
1179                      //                                       bit 2 = dlc on gfx10+),
1180                      //                      swizzled buffer (bit 3 = swz))
1181  [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1182  AMDGPURsrcIntrinsic<0>;
1183def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad;
1184
// Struct (indexed) buffer load taking the resource as <4 x i32>. Adds a
// vindex operand after rsrc, so the immediate cachepolicy is argument 4;
// rsrc is argument 0.
1185class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1186  [data_ty],
1187  [llvm_v4i32_ty,    // rsrc(SGPR)
1188   llvm_i32_ty,      // vindex(VGPR)
1189   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
1190   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1191   llvm_i32_ty],     // auxiliary/cachepolicy(imm):
1192                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1193                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1194                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1195                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1196                     //                bit 6 = swz
1197                     //           all: volatile op (bit 31, stripped at lowering)
1198  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1199  AMDGPURsrcIntrinsic<0>;
// Both variants use the default data_ty (llvm_any_ty).
1200def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
1201def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
1202
// Struct (indexed) buffer load taking the resource as an AMDGPUBufferRsrcTy
// pointer. The pointer is marked ReadOnly/NoCapture, the intrinsic is
// IntrArgMemOnly + IntrReadMem, and the immediate cachepolicy is argument 4.
1203class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1204  [data_ty],
1205  [AMDGPUBufferRsrcTy,    // rsrc(SGPR)
1206   llvm_i32_ty,           // vindex(VGPR)
1207   llvm_i32_ty,           // offset(VGPR/imm, included in bounds checking and swizzling)
1208   llvm_i32_ty,           // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1209   llvm_i32_ty],          // auxiliary/cachepolicy(imm):
1210                          //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1211                          //                bit 3 = swz, bit 4 = scc (gfx90a)
1212                          //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1213                          //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1214                          //                bit 6 = swz
1215                          //           all: volatile op (bit 31, stripped at lowering)
1216  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
1217   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1218  AMDGPURsrcIntrinsic<0>;
// Both variants use the default data_ty (llvm_any_ty).
1219def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad;
1220def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;
1221
// Raw (no vindex) buffer store taking the resource as <4 x i32>. No return
// value; the stored data is argument 0, so rsrc is argument 1 and the
// immediate cachepolicy is argument 4. Write-only.
1222class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1223  [],
1224  [data_ty,          // vdata(VGPR)
1225   llvm_v4i32_ty,    // rsrc(SGPR)
1226   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
1227   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1228   llvm_i32_ty],     // auxiliary/cachepolicy(imm):
1229                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1230                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1231                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1232                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1233                     //                bit 6 = swz
1234                     //           all: volatile op (bit 31, stripped at lowering)
1235  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1236  AMDGPURsrcIntrinsic<1>;
// The format variant restricts the stored data to floating-point types.
1237def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
1238def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
1239
// Raw buffer store taking the resource as an AMDGPUBufferRsrcTy pointer
// (argument 1). The pointer is marked WriteOnly/NoCapture, the intrinsic is
// IntrArgMemOnly + IntrWriteMem, and the immediate cachepolicy is argument 4.
1240class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1241  [],
1242  [data_ty,               // vdata(VGPR)
1243   AMDGPUBufferRsrcTy,    // rsrc(SGPR)
1244   llvm_i32_ty,           // offset(VGPR/imm, included in bounds checking and swizzling)
1245   llvm_i32_ty,           // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1246   llvm_i32_ty],          // auxiliary/cachepolicy(imm):
1247                          //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1248                          //                bit 3 = swz, bit 4 = scc (gfx90a)
1249                          //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1250                          //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1251                          //                bit 6 = swz
1252                          //           all: volatile op (bit 31, stripped at lowering)
1253  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
1254  ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1255  AMDGPURsrcIntrinsic<1>;
// The format variant restricts the stored data to floating-point types.
1256def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>;
1257def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore;
1258
// Struct (indexed) buffer store taking the resource as <4 x i32>. With vdata
// first and a vindex operand after rsrc, rsrc is argument 1 and the
// immediate cachepolicy is argument 5. Write-only.
1259class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1260  [],
1261  [data_ty,          // vdata(VGPR)
1262   llvm_v4i32_ty,    // rsrc(SGPR)
1263   llvm_i32_ty,      // vindex(VGPR)
1264   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
1265   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1266   llvm_i32_ty],     // auxiliary/cachepolicy(imm):
1267                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1268                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1269                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1270                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1271                     //                bit 6 = swz
1272                     //           all: volatile op (bit 31, stripped at lowering)
1273  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1274  AMDGPURsrcIntrinsic<1>;
// Both variants use the default data_ty (llvm_any_ty).
1275def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
1276def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
1277
// Struct (indexed) buffer store taking the resource as an AMDGPUBufferRsrcTy
// pointer (argument 1). The pointer is marked WriteOnly/NoCapture, the
// intrinsic is IntrArgMemOnly + IntrWriteMem, and the immediate cachepolicy
// is argument 5.
1278class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1279  [],
1280  [data_ty,               // vdata(VGPR)
1281   AMDGPUBufferRsrcTy,    // rsrc(SGPR)
1282   llvm_i32_ty,           // vindex(VGPR)
1283   llvm_i32_ty,           // offset(VGPR/imm, included in bounds checking and swizzling)
1284   llvm_i32_ty,           // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1285   llvm_i32_ty],          // auxiliary/cachepolicy(imm):
1286                          //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1287                          //                bit 3 = swz, bit 4 = scc (gfx90a)
1288                          //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1289                          //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1290                          //                bit 6 = swz
1291                          //           all: volatile op (bit 31, stripped at lowering)
1292  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
1293   ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1294  AMDGPURsrcIntrinsic<1>;
// Both variants use the default data_ty (llvm_any_ty).
1295def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore;
1296def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore;
1297
// Raw (offset-only, no vindex) buffer atomic taking the resource as
// llvm_v4i32_ty. The result has type data_ty and the vdata operand is tied to
// the same type through LLVMMatchType<0>. The class derives from Intrinsic
// rather than DefaultAttrsIntrinsic and lists IntrWillReturn, IntrNoCallback
// and IntrNoFree explicitly; no memory-effect property is attached. The
// cachepolicy operand (index 4) must be an immediate.
1298class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
1299  [data_ty],
1300  [LLVMMatchType<0>,  // vdata(VGPR)
1301   llvm_v4i32_ty,     // rsrc(SGPR)
1302   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1303   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1304   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1305  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1306  AMDGPURsrcIntrinsic<1, 0>;  // RsrcArg = 1, IsImage = 0
// One record per atomic operation. fmin and fmax narrow data_ty to
// llvm_anyfloat_ty; every other record keeps the llvm_any_ty default.
1307def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
1308def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
1309def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
1310def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
1311def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
1312def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
1313def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
1314def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
1315def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
1316def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
1317def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
1318def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
1319def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
1320def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
1321def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
// Raw buffer compare-and-swap with a llvm_v4i32_ty resource. The result is
// overloaded on an integer type (llvm_anyint_ty) and both src and cmp must
// match it. Because of the extra cmp operand, the resource sits at index 2
// and the immediate cachepolicy at index 5.
1322def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
1323  [llvm_anyint_ty],
1324  [LLVMMatchType<0>,  // src(VGPR)
1325   LLVMMatchType<0>,  // cmp(VGPR)
1326   llvm_v4i32_ty,     // rsrc(SGPR)
1327   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1328   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1329   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1330  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1331  AMDGPURsrcIntrinsic<2, 0>;  // RsrcArg = 2, IsImage = 0
1332
// Raw (offset-only, no vindex) buffer atomic whose resource operand has type
// AMDGPUBufferRsrcTy. The result has type data_ty and vdata is tied to it via
// LLVMMatchType<0>. In addition to the explicit IntrWillReturn /
// IntrNoCallback / IntrNoFree, the record is IntrArgMemOnly and marks the rsrc
// operand (index 1) NoCapture. The cachepolicy operand (index 4) must be an
// immediate.
1333class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
1334  [data_ty],
1335  [LLVMMatchType<0>,            // vdata(VGPR)
1336   AMDGPUBufferRsrcTy,          // rsrc(SGPR)
1337   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
1338   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1339   llvm_i32_ty],                // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1340  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
1341   ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1342  AMDGPURsrcIntrinsic<1, 0>;  // RsrcArg = 1, IsImage = 0
1343
// One record per atomic operation. fmin and fmax narrow data_ty to
// llvm_anyfloat_ty; every other record keeps the llvm_any_ty default.
1344def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic;
1345def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic;
1346def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic;
1347def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic;
1348def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic;
1349def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
1350def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic;
1351def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic;
1352def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
1353def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic;
1354def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
1355def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
1356def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
1357def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
1358def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
// Raw buffer compare-and-swap with an AMDGPUBufferRsrcTy resource. The result
// is overloaded on an integer type (llvm_anyint_ty) and both src and cmp must
// match it. The resource is operand 2 (IntrArgMemOnly, NoCapture) and the
// immediate cachepolicy is operand 5.
1359def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
1360  [llvm_anyint_ty],
1361  [LLVMMatchType<0>,  // src(VGPR)
1362   LLVMMatchType<0>,  // cmp(VGPR)
1363   AMDGPUBufferRsrcTy, // rsrc(SGPR)
1364   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1365   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1366   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1367  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
1368   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1369  AMDGPURsrcIntrinsic<2, 0>;  // RsrcArg = 2, IsImage = 0
1370
1371// gfx908 intrinsic
// Raw buffer atomic fadd; data_ty is narrowed to llvm_anyfloat_ty.
1372def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
1373
1374// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx12+.
// Same operation on the AMDGPUBufferRsrcTy (pointer) form of the resource.
1375def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
1376
// Struct-addressed (vindex + offset) buffer atomic taking the resource as
// llvm_v4i32_ty. The result has type data_ty and vdata is tied to it via
// LLVMMatchType<0>. The class derives from Intrinsic rather than
// DefaultAttrsIntrinsic and lists IntrWillReturn, IntrNoCallback and
// IntrNoFree explicitly. The cachepolicy operand (index 5) must be an
// immediate.
1377class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
1378  [data_ty],
1379  [LLVMMatchType<0>,  // vdata(VGPR)
1380   llvm_v4i32_ty,     // rsrc(SGPR)
1381   llvm_i32_ty,       // vindex(VGPR)
1382   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1383   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1384   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1385  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1386  AMDGPURsrcIntrinsic<1, 0>;  // RsrcArg = 1, IsImage = 0
// One record per atomic operation, all with the llvm_any_ty default. The
// floating-point struct variants are declared separately further down.
1387def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
1388def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
1389def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
1390def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
1391def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
1392def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
1393def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
1394def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
1395def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
1396def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
1397def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
1398def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
1399def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
// Struct-addressed buffer compare-and-swap with a llvm_v4i32_ty resource. The
// result is overloaded on an integer type (llvm_anyint_ty) and both src and
// cmp must match it. The resource is operand 2 and the immediate cachepolicy
// is operand 6.
1400def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
1401  [llvm_anyint_ty],
1402  [LLVMMatchType<0>,  // src(VGPR)
1403   LLVMMatchType<0>,  // cmp(VGPR)
1404   llvm_v4i32_ty,     // rsrc(SGPR)
1405   llvm_i32_ty,       // vindex(VGPR)
1406   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1407   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1408   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1409  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1410  AMDGPURsrcIntrinsic<2, 0>;  // RsrcArg = 2, IsImage = 0
1411
// Struct-addressed (vindex + offset) buffer atomic whose resource operand has
// type AMDGPUBufferRsrcTy. The result has type data_ty and vdata is tied to it
// via LLVMMatchType<0>. In addition to the explicit IntrWillReturn /
// IntrNoCallback / IntrNoFree, the record is IntrArgMemOnly and marks the rsrc
// operand (index 1) NoCapture. The cachepolicy operand (index 5) must be an
// immediate.
1412class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
1413  [data_ty],
1414  [LLVMMatchType<0>,            // vdata(VGPR)
1415   AMDGPUBufferRsrcTy,          // rsrc(SGPR)
1416   llvm_i32_ty,                 // vindex(VGPR)
1417   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
1418   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1419   llvm_i32_ty],                // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1420  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
1421   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1422  AMDGPURsrcIntrinsic<1, 0>;  // RsrcArg = 1, IsImage = 0
// One record per atomic operation, all with the llvm_any_ty default. The
// floating-point struct_ptr variants are declared separately further down.
1423def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic;
1424def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic;
1425def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic;
1426def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic;
1427def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic;
1428def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic;
1429def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic;
1430def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic;
1431def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
1432def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
1433def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
1434def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
1435def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
// Struct-addressed buffer compare-and-swap with an AMDGPUBufferRsrcTy
// resource. The result is overloaded on an integer type (llvm_anyint_ty) and
// both src and cmp must match it. The resource is operand 2 (IntrArgMemOnly,
// NoCapture) and the immediate cachepolicy is operand 6.
1436def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
1437  [llvm_anyint_ty],
1438  [LLVMMatchType<0>,  // src(VGPR)
1439   LLVMMatchType<0>,  // cmp(VGPR)
1440   AMDGPUBufferRsrcTy, // rsrc(SGPR)
1441   llvm_i32_ty,       // vindex(VGPR)
1442   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1443   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1444   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
1445  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
1446   ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1447  AMDGPURsrcIntrinsic<2, 0>;  // RsrcArg = 2, IsImage = 0
1448
// Floating-point struct buffer atomics. Each record narrows data_ty to
// llvm_anyfloat_ty; the struct_ptr forms use the AMDGPUBufferRsrcTy resource.
1449// gfx908 intrinsic
1450def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
1451def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
1452
1453// gfx90a intrinsics
1454def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
1455def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
1456
1457def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
1458def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
1459
1460// tbuffer intrinsics, with:
1461// - raw and struct variants
1462// - joint format field
1463// - joint cachepolicy field
// Raw (offset-only) typed buffer load with a llvm_v4i32_ty resource. The
// overloaded result type is the loaded value; the format (index 3) and
// auxiliary (index 4) operands must be immediates. The only memory-effect
// property is IntrReadMem.
1464def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic <
1465    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1466    [llvm_v4i32_ty,   // rsrc(SGPR)
1467     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
1468     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1469     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1470     llvm_i32_ty],    // auxiliary/cachepolicy(imm):
1471                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1472                      //                bit 3 = swz, bit 4 = scc (gfx90a)
1473                      //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1474                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1475                      //                bit 6 = swz
                          //           all: volatile op (bit 31, stripped at lowering)
1476    [IntrReadMem,
1477     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1478  AMDGPURsrcIntrinsic<0>;  // RsrcArg = 0: operand 0 is the resource
1479
// Raw (offset-only) typed buffer load with an AMDGPUBufferRsrcTy resource.
// The overloaded result type is the loaded value. Memory effects are
// IntrArgMemOnly + IntrReadMem, the rsrc operand (index 0) is ReadOnly and
// NoCapture, and the format (index 3) and auxiliary (index 4) operands must
// be immediates.
1480def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic <
1481    [llvm_any_ty],       // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1482    [AMDGPUBufferRsrcTy, // rsrc(SGPR)
1483     llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
1484     llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1485     llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1486     llvm_i32_ty],      // auxiliary/cachepolicy(imm):
1487                        //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1488                        //                bit 3 = swz, bit 4 = scc (gfx90a)
1489                        //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1490                        //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1491                        //                bit 6 = swz
1492                        //           all: volatile op (bit 31, stripped at lowering)
1493    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
1494     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
1495  AMDGPURsrcIntrinsic<0>;  // RsrcArg = 0: operand 0 is the resource
1496
// Raw (offset-only) typed buffer store with a llvm_v4i32_ty resource. It
// produces no result; vdata (operand 0) is overloaded. The format (index 4)
// and auxiliary (index 5) operands must be immediates. The only memory-effect
// property is IntrWriteMem.
1497def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
1498    [],
1499    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1500     llvm_v4i32_ty,  // rsrc(SGPR)
1501     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
1502     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1503     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1504     llvm_i32_ty],   // auxiliary/cachepolicy(imm):
1505                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1506                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1507                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1508                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1509                     //                bit 6 = swz
1510                     //           all: volatile op (bit 31, stripped at lowering)
1511    [IntrWriteMem,
1512     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1513  AMDGPURsrcIntrinsic<1>;  // RsrcArg = 1: operand 1 is the resource
1514
// Raw (offset-only) typed buffer store with an AMDGPUBufferRsrcTy resource.
// It produces no result; vdata (operand 0) is overloaded. Memory effects are
// IntrArgMemOnly + IntrWriteMem, the rsrc operand (index 1) is WriteOnly and
// NoCapture, and the format (index 4) and auxiliary (index 5) operands must
// be immediates.
1515def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic <
1516    [],
1517    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1518     AMDGPUBufferRsrcTy, // rsrc(SGPR)
1519     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
1520     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1521     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1522     llvm_i32_ty],   // auxiliary/cachepolicy(imm):
1523                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1524                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1525                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1526                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1527                     //                bit 6 = swz
1528                     //           all: volatile op (bit 31, stripped at lowering)
1529    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
1530     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1531  AMDGPURsrcIntrinsic<1>;  // RsrcArg = 1: operand 1 is the resource
1532
// Struct-addressed (vindex + offset) typed buffer load with a llvm_v4i32_ty
// resource. The overloaded result type is the loaded value; the format
// (index 4) and auxiliary (index 5) operands must be immediates. The only
// memory-effect property is IntrReadMem.
1533def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic <
1534    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1535    [llvm_v4i32_ty,   // rsrc(SGPR)
1536     llvm_i32_ty,     // vindex(VGPR)
1537     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
1538     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1539     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1540     llvm_i32_ty],    // auxiliary/cachepolicy(imm):
1541                      //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1542                      //                bit 3 = swz, bit 4 = scc (gfx90a)
1543                      //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1544                      //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1545                      //                bit 6 = swz
1546                      //           all: volatile op (bit 31, stripped at lowering)
1547    [IntrReadMem,
1548     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1549  AMDGPURsrcIntrinsic<0>;  // RsrcArg = 0: operand 0 is the resource
1550
// Struct-addressed (vindex + offset) typed buffer load with an
// AMDGPUBufferRsrcTy resource. The overloaded result type is the loaded
// value. Memory effects are IntrArgMemOnly + IntrReadMem, the rsrc operand
// (index 0) is ReadOnly and NoCapture, and the format (index 4) and auxiliary
// (index 5) operands must be immediates.
1551def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic <
1552    [llvm_any_ty],       // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1553    [AMDGPUBufferRsrcTy, // rsrc(SGPR)
1554     llvm_i32_ty,        // vindex(VGPR)
1555     llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
1556     llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1557     llvm_i32_ty,        // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1558     llvm_i32_ty],       // auxiliary/cachepolicy(imm):
1559                         //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1560                         //                bit 3 = swz, bit 4 = scc (gfx90a)
1561                         //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1562                         //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1563                         //                bit 6 = swz
1564                         //           all: volatile op (bit 31, stripped at lowering)
1565    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
1566     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
1567  AMDGPURsrcIntrinsic<0>;  // RsrcArg = 0: operand 0 is the resource
1568
// Struct-addressed (vindex + offset) typed buffer store with an
// AMDGPUBufferRsrcTy resource. It produces no result; vdata (operand 0) is
// overloaded. Memory effects are IntrArgMemOnly + IntrWriteMem, the rsrc
// operand (index 1) is WriteOnly and NoCapture, and the format (index 5) and
// auxiliary (index 6) operands must be immediates.
1569def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic <
1570    [],
1571    [llvm_any_ty,        // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1572     AMDGPUBufferRsrcTy, // rsrc(SGPR)
1573     llvm_i32_ty,        // vindex(VGPR)
1574     llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
1575     llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1576     llvm_i32_ty,        // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1577     llvm_i32_ty],       // auxiliary/cachepolicy(imm):
1578                         //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1579                         //                bit 3 = swz, bit 4 = scc (gfx90a)
1580                         //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1581                         //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1582                         //                bit 6 = swz
1583                         //           all: volatile op (bit 31, stripped at lowering)
1584    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
1585     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
1586  AMDGPURsrcIntrinsic<1>;  // RsrcArg = 1: operand 1 is the resource
1587
// Struct-addressed (vindex + offset) typed buffer store with a llvm_v4i32_ty
// resource. It produces no result; vdata (operand 0) is overloaded. The
// format (index 5) and auxiliary (index 6) operands must be immediates. The
// only memory-effect property is IntrWriteMem.
1588def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
1589    [],
1590    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
1591     llvm_v4i32_ty,  // rsrc(SGPR)
1592     llvm_i32_ty,    // vindex(VGPR)
1593     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
1594     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1595     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
1596     llvm_i32_ty],   // auxiliary/cachepolicy(imm):
1597                     //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1598                     //                bit 3 = swz, bit 4 = scc (gfx90a)
1599                     //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1600                     //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1601                     //                bit 6 = swz
1602                     //           all: volatile op (bit 31, stripped at lowering)
1603    [IntrWriteMem,
1604     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
1605  AMDGPURsrcIntrinsic<1>;  // RsrcArg = 1: operand 1 is the resource
1606
// Raw (no vindex) buffer load that returns no IR value; it takes an
// LLVMQualPointerType<3> operand documented as the LDS base offset. The
// resource is a llvm_v4i32_ty in operand 0. The class derives from Intrinsic
// and lists IntrWillReturn, IntrNoCallback and IntrNoFree explicitly; the LDS
// pointer (index 1) is NoCapture. The data size (index 2), imm offset
// (index 5) and auxiliary (index 6) operands must be immediates.
1607class AMDGPURawBufferLoadLDS : Intrinsic <
1608  [],
1609  [llvm_v4i32_ty,             // rsrc(SGPR)
1610   LLVMQualPointerType<3>,    // LDS base offset
1611   llvm_i32_ty,               // Data byte size: 1/2/4
1612   llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
1613   llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1614   llvm_i32_ty,               // imm offset(imm, included in bounds checking and swizzling)
1615   llvm_i32_ty],              // auxiliary/cachepolicy(imm):
1616                              //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1617                              //                bit 3 = swz, bit 4 = scc (gfx90a)
1618                              //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1619                              //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1620                              //                bit 6 = swz
1621                              //           all: volatile op (bit 31, stripped at lowering)
1622  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
1623   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
1624def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
1625
// Raw (no vindex) buffer load that returns no IR value; it takes an
// LLVMQualPointerType<3> operand documented as the LDS base offset. The
// resource is an AMDGPUBufferRsrcTy in operand 0. The record is
// IntrArgMemOnly: the resource is ReadOnly + NoCapture and the LDS pointer
// (index 1) is WriteOnly + NoCapture. The data size (index 2), imm offset
// (index 5) and auxiliary (index 6) operands must be immediates.
1626class AMDGPURawPtrBufferLoadLDS : Intrinsic <
1627  [],
1628  [AMDGPUBufferRsrcTy,        // rsrc(SGPR)
1629   LLVMQualPointerType<3>,    // LDS base offset
1630   llvm_i32_ty,               // Data byte size: 1/2/4
1631   llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
1632   llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1633   llvm_i32_ty,               // imm offset(imm, included in bounds checking and swizzling)
1634   llvm_i32_ty],              // auxiliary/cachepolicy(imm):
1635                              //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1636                              //                bit 3 = swz, bit 4 = scc (gfx90a)
1637                              //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1638                              //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1639                              //                bit 6 = swz
1640                              //           all: volatile op (bit 31, stripped at lowering)
1641  [IntrWillReturn, IntrArgMemOnly,
1642   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
1643   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
1644   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
1645   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
1646def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS;
1647
// Struct-addressed (vindex + voffset) buffer load that returns no IR value;
// it takes an LLVMQualPointerType<3> operand documented as the LDS base
// offset. The resource is a llvm_v4i32_ty in operand 0. The class derives
// from Intrinsic and lists IntrWillReturn, IntrNoCallback and IntrNoFree
// explicitly; the LDS pointer (index 1) is NoCapture. The data size
// (index 2), imm offset (index 6) and auxiliary (index 7) operands must be
// immediates.
1648class AMDGPUStructBufferLoadLDS : Intrinsic <
1649  [],
1650  [llvm_v4i32_ty,             // rsrc(SGPR)
1651   LLVMQualPointerType<3>,    // LDS base offset
1652   llvm_i32_ty,               // Data byte size: 1/2/4
1653   llvm_i32_ty,               // vindex(VGPR)
1654   llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
1655   llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1656   llvm_i32_ty,               // imm offset(imm, included in bounds checking and swizzling)
1657   llvm_i32_ty],              // auxiliary/cachepolicy(imm):
1658                              //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1659                              //                bit 3 = swz, bit 4 = scc (gfx90a)
1660                              //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1661                              //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1662                              //                bit 6 = swz
1663                              //           all: volatile op (bit 31, stripped at lowering)
1664  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
1665   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
1666def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;
1667
// Struct-addressed (vindex + voffset) buffer load that returns no IR value;
// it takes an LLVMQualPointerType<3> operand documented as the LDS base
// offset. The resource is an AMDGPUBufferRsrcTy in operand 0. The record is
// IntrArgMemOnly: the resource is ReadOnly + NoCapture and the LDS pointer
// (index 1) is WriteOnly + NoCapture. The data size (index 2), imm offset
// (index 6) and auxiliary (index 7) operands must be immediates.
1668class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
1669  [],
1670  [AMDGPUBufferRsrcTy,        // rsrc(SGPR)
1671   LLVMQualPointerType<3>,    // LDS base offset
1672   llvm_i32_ty,               // Data byte size: 1/2/4
1673   llvm_i32_ty,               // vindex(VGPR)
1674   llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
1675   llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
1676   llvm_i32_ty,               // imm offset(imm, included in bounds checking and swizzling)
1677   llvm_i32_ty],              // auxiliary/cachepolicy(imm):
1678                              //                bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
1679                              //                bit 3 = swz, bit 4 = scc (gfx90a)
1680                              //        gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1681                              //        gfx12+: bits [0-2] = th, bits [3-4] = scope,
1682                              //                bit 6 = swz
1683                              //           all: volatile op (bit 31, stripped at lowering)
1684  [IntrWillReturn, IntrArgMemOnly,
1685   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
1686   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
1687   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
1688   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
1689def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;
1690
1691} // defset AMDGPUBufferIntrinsics
1692
// Export intrinsic with four source operands. It produces no result; src0 is
// overloaded and src1..src3 must match its type. tgt, en, done and vm
// (indices 0, 1, 6, 7) must be immediates. Declared IntrWriteMem +
// IntrInaccessibleMemOnly.
1693// Uses that do not set the done bit should set IntrWriteMem on the
1694// call site.
1695def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [
1696  llvm_i32_ty,       // tgt,
1697  llvm_i32_ty,       // en
1698  llvm_any_ty,       // src0 (f32 or i32)
1699  LLVMMatchType<0>,  // src1
1700  LLVMMatchType<0>,  // src2
1701  LLVMMatchType<0>,  // src3
1702  llvm_i1_ty,        // done
1703  llvm_i1_ty         // vm (ignored on GFX11+)
1704  ],
1705  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
1706   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly]
1707>;
1708
1709// exp with row_en bit set. Only supported on GFX11+.
// Same operand layout as int_amdgcn_exp except that the last operand is an
// i32 row number instead of the i1 vm flag. Only tgt, en and done (indices
// 0, 1, 6) are ImmArg; the row number carries no ImmArg.
1710def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [
1711  llvm_i32_ty,       // tgt,
1712  llvm_i32_ty,       // en
1713  llvm_any_ty,       // src0 (f32 or i32)
1714  LLVMMatchType<0>,  // src1
1715  LLVMMatchType<0>,  // src2
1716  LLVMMatchType<0>,  // src3
1717  llvm_i1_ty,        // done
1718  llvm_i32_ty],      // row number
1719  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
1720   IntrWriteMem, IntrInaccessibleMemOnly]
1721>;
1722
1723// exp with compr bit set. Not supported on GFX11+.
// Takes two source operands: src0 is overloaded on a vector type and src1
// must match it. tgt, en, done and vm (indices 0, 1, 4, 5) must be
// immediates. Declared IntrWriteMem + IntrInaccessibleMemOnly.
1724def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [
1725  llvm_i32_ty,       // tgt,
1726  llvm_i32_ty,       // en
1727  llvm_anyvector_ty, // src0 (v2f16 or v2i16)
1728  LLVMMatchType<0>,  // src1
1729  llvm_i1_ty,        // done
1730  llvm_i1_ty],       // vm
1731  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
1732   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly]
1733>;
1734
// Four operand-less intrinsics, each exposed through the Clang builtin named
// in its ClangBuiltin<> parent. All are IntrNoMem + IntrHasSideEffects. The
// first three return nothing; s_memtime returns an i64.
1735def int_amdgcn_buffer_wbinvl1_sc :
1736  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
1737  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
1738
1739def int_amdgcn_buffer_wbinvl1 :
1740  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
1741  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
1742
1743def int_amdgcn_s_dcache_inv :
1744  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
1745  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
1746
1747def int_amdgcn_s_memtime :
1748  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
1749  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>;
1750
// Takes one i32 operand that must be an immediate and returns nothing.
// IntrNoMem + IntrHasSideEffects. The record body is empty.
1751def int_amdgcn_s_sleep :
1752  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
1753  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
1754                                IntrHasSideEffects]> {
1755}
1756
// Takes one i32 operand with no ImmArg property and returns nothing. Unlike
// int_amdgcn_s_sleep it derives from Intrinsic rather than
// DefaultAttrsIntrinsic, so only the three listed properties are attached.
// The record body is empty.
1757def int_amdgcn_s_sleep_var
1758    : ClangBuiltin<"__builtin_amdgcn_s_sleep_var">,
1759      Intrinsic<[], [llvm_i32_ty],
1760                [IntrNoMem, IntrHasSideEffects, IntrWillReturn]> {
1761}
1762
// Takes one i16 operand that must be an immediate and returns nothing.
// IntrNoMem + IntrHasSideEffects. No ClangBuiltin parent is attached here.
1763def int_amdgcn_s_nop :
1764  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
1765                                IntrHasSideEffects]> {
1766}
1767
// s_incperflevel / s_decperflevel: identical signatures. Each takes one i32
// operand that must be an immediate, returns nothing, and is
// IntrNoMem + IntrHasSideEffects.
1768def int_amdgcn_s_incperflevel :
1769  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
1770  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
1771                                IntrHasSideEffects]> {
1772}
1773
1774def int_amdgcn_s_decperflevel :
1775  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
1776  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
1777                                IntrHasSideEffects]> {
1778}
1779
// s_sethalt takes an i32 immediate and has no ClangBuiltin parent; s_setprio
// takes an i16 immediate and is exposed as __builtin_amdgcn_s_setprio. Both
// return nothing and are IntrNoMem + IntrHasSideEffects.
1780def int_amdgcn_s_sethalt :
1781  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
1782                                IntrHasSideEffects]>;
1783
1784def int_amdgcn_s_setprio :
1785  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
1786  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
1787                                IntrHasSideEffects]>;
1788
// Two forms of s_ttracedata, both returning nothing and both
// IntrNoMem + IntrHasSideEffects. The plain form takes an i32 operand with no
// ImmArg; the _imm form takes an i16 operand that must be an immediate.
1789def int_amdgcn_s_ttracedata :
1790  ClangBuiltin<"__builtin_amdgcn_s_ttracedata">,
1791  DefaultAttrsIntrinsic<[], [llvm_i32_ty],
1792                        [IntrNoMem, IntrHasSideEffects]>;
1793
1794def int_amdgcn_s_ttracedata_imm :
1795  ClangBuiltin<"__builtin_amdgcn_s_ttracedata_imm">,
1796  DefaultAttrsIntrinsic<[], [llvm_i16_ty],
1797                        [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
1798
// Returns an i32 and takes one i32 operand that must be an immediate.
1799// This is IntrHasSideEffects so it can be used to read cycle counters.
1800def int_amdgcn_s_getreg :
1801  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
1802  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty],
1803  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
1804>;
1805
1806// Note this can be used to set FP environment properties that are
1807// unsafe to change in non-strictfp functions. The register properties
1808// available (and value required to access them) may differ per
1809// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
// Returns nothing. Only the first operand (hwmode, index 0) must be an
// immediate; the value operand carries no ImmArg.
1810def int_amdgcn_s_setreg :
1811  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
1812  DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty],
1813  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
1814>;
1815
1816// int_amdgcn_s_getpc is provided to allow a specific style of position
1817// independent code to determine the high part of its address when it is
1818// known (through convention) that the code and any data of interest does
1819// not cross a 4 GiB address boundary. Use for any other purpose may not
1820// produce the desired results as optimizations may cause code movement,
1821// especially as we explicitly use IntrNoMem to allow optimizations.
1822// This intrinsic always returns PC sign-extended from 48 bits even if the
1823// s_getpc_b64 instruction returns a zero-extended value.
// Takes no operands and returns an i64 marked NoUndef.
// NOTE(review): IntrWillReturn is spelled out here, unlike the other
// DefaultAttrsIntrinsic records in this section - presumably redundant;
// confirm against the DefaultAttrsIntrinsic definition.
1824def int_amdgcn_s_getpc :
1825  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
1826  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [NoUndef<RetIndex>, IntrNoMem,
1827                                IntrSpeculatable, IntrWillReturn]>;
1828
1829// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
1830// param values: 0 = P10, 1 = P20, 2 = P0
// Returns a float. param, attr_chan and attr (indices 0-2) must be
// immediates; m0 (index 3) carries no ImmArg. IntrNoMem + IntrSpeculatable.
1831def int_amdgcn_interp_mov :
1832  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
1833  DefaultAttrsIntrinsic<[llvm_float_ty],
1834            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1835            [IntrNoMem, IntrSpeculatable,
1836              ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
1837
1838// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
1839// This intrinsic reads from lds, but the memory values are constant,
1840// so it behaves like IntrNoMem.
1841def int_amdgcn_interp_p1 :
1842  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
1843  DefaultAttrsIntrinsic<[llvm_float_ty],
1844            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1845            [IntrNoMem, IntrSpeculatable,
1846             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;  // <attr_chan> and <attr> must be immediates
1847
1848// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
1849def int_amdgcn_interp_p2 :
1850  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
1851  DefaultAttrsIntrinsic<[llvm_float_ty],
1852            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1853            [IntrNoMem, IntrSpeculatable,
1854             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;  // <attr_chan> and <attr> must be immediates
1855          // See int_amdgcn_interp_p1 for why this is IntrNoMem.
1856
1857// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
1858// high selects whether high or low 16-bits are loaded from LDS
1859def int_amdgcn_interp_p1_f16 :
1860  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
1861  DefaultAttrsIntrinsic<[llvm_float_ty],  // note: the result is float, not half
1862            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
1863            [IntrNoMem, IntrSpeculatable,
1864             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;  // <attr_chan>, <attr>, <high> must be immediates
1865
1866// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
1867// high selects whether high or low 16-bits are loaded from LDS
1868def int_amdgcn_interp_p2_f16 :
1869  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
1870  DefaultAttrsIntrinsic<[llvm_half_ty],  // half result; <p1> and <j> are float
1871            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
1872            [IntrNoMem, IntrSpeculatable,
1873             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;  // <attr_chan>, <attr>, <high> must be immediates
1874
1875// llvm.amdgcn.lds.direct.load <m0>
1876// The input argument is m0, which contains a packed combination of address
1877// offset and flags describing the data type.
1878def int_amdgcn_lds_direct_load :
1879  DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
1880            [llvm_i32_ty],             // <m0>
1881            [IntrReadMem, IntrSpeculatable]>;  // declared as a memory read (IntrReadMem), not IntrNoMem
1882
1883// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
1884// Like interp intrinsics, this reads from lds, but the memory values are constant,
1885// so it behaves like IntrNoMem.
1886def int_amdgcn_lds_param_load :
1887  DefaultAttrsIntrinsic<[llvm_float_ty],
1888            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1889            [IntrNoMem, IntrSpeculatable,
1890             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;  // <attr_chan> and <attr> must be immediates; <m0> need not be
1891
1892// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
1893def int_amdgcn_interp_inreg_p10 :
1894  DefaultAttrsIntrinsic<[llvm_float_ty],
1895            [llvm_float_ty, llvm_float_ty, llvm_float_ty],  // all three operands are float; none is an immediate
1896            [IntrNoMem, IntrSpeculatable]>;
1897
1898// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
1899def int_amdgcn_interp_inreg_p2 :
1900  DefaultAttrsIntrinsic<[llvm_float_ty],
1901            [llvm_float_ty, llvm_float_ty, llvm_float_ty],  // all three operands are float; none is an immediate
1902            [IntrNoMem, IntrSpeculatable]>;
1903
1904// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
1905// high selects whether high or low 16-bits are used for p and p0 operands
1906def int_amdgcn_interp_inreg_p10_f16:
1907  DefaultAttrsIntrinsic<[llvm_float_ty],  // float result
1908            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
1909            [IntrNoMem, IntrSpeculatable,
1910             ImmArg<ArgIndex<3>>]>;  // <high> must be an immediate
1911
1912// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
1913// high selects whether high or low 16-bits are used for p operand
1914def int_amdgcn_interp_inreg_p2_f16 :
1915  DefaultAttrsIntrinsic<[llvm_half_ty],  // half result
1916            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
1917            [IntrNoMem, IntrSpeculatable,
1918             ImmArg<ArgIndex<3>>]>;  // <high> must be an immediate
1919
1920// llvm.amdgcn.interp.p10.rtz.f16 <p>, <i>, <p0>, <high>
1921// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1922// high selects whether high or low 16-bits are used for p and p0 operands
1923def int_amdgcn_interp_p10_rtz_f16:
1924  DefaultAttrsIntrinsic<[llvm_float_ty],  // float result
1925            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
1926            [IntrNoMem, IntrSpeculatable,
1927             ImmArg<ArgIndex<3>>]>;  // <high> must be an immediate
1928
1929// llvm.amdgcn.interp.p2.rtz.f16 <p>, <j>, <tmp>, <high>
1930// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
1931// high selects whether high or low 16-bits are used for p operand
1932def int_amdgcn_interp_p2_rtz_f16 :
1933  DefaultAttrsIntrinsic<[llvm_half_ty],  // half result
1934            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
1935            [IntrNoMem, IntrSpeculatable,
1936             ImmArg<ArgIndex<3>>]>;  // <high> must be an immediate
1937
1938// Deprecated: use llvm.amdgcn.live.mask instead.
1939def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
1940  [llvm_i1_ty],   // i1 result
1941  [],             // no operands
1942  [IntrNoMem]>;
1943
1944// Query currently live lanes.
1945// Returns true if lane is live (and not a helper lane).
1946def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
1947  [], [NoUndef<RetIndex>, IntrReadMem, IntrInaccessibleMemOnly]  // declared as a read of inaccessible memory rather than IntrNoMem
1948>;
1949
1950def int_amdgcn_mbcnt_lo :
1951  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
1952  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],  // (i32, i32) -> i32
1953   [IntrNoMem]>;  // not marked IntrSpeculatable
1954
1955def int_amdgcn_mbcnt_hi :
1956  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
1957  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],  // (i32, i32) -> i32
1958            [IntrNoMem]>;  // not marked IntrSpeculatable
1959
1960// llvm.amdgcn.ds.swizzle src offset
1961def int_amdgcn_ds_swizzle :
1962  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
1963  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],  // plain Intrinsic: attributes are listed explicitly below
1964            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
1965             ImmArg<ArgIndex<1>>]>;  // offset (operand 1) must be an immediate
1966
1967def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],  // result is an overloaded integer type
1968    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],  // operand 0 has the result type; the other two are always i32
1969    [IntrNoMem, IntrSpeculatable]
1970>;
1971
1972def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],  // result is an overloaded integer type
1973    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],  // operand 0 has the result type; the other two are always i32
1974    [IntrNoMem, IntrSpeculatable]
1975>;
1976
1977def int_amdgcn_lerp :
1978  ClangBuiltin<"__builtin_amdgcn_lerp">,
1979  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],  // (i32, i32, i32) -> i32
1980  [IntrNoMem, IntrSpeculatable]
1981>;
1982
1983def int_amdgcn_sad_u8 :
1984  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
1985  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],  // (i32, i32, i32) -> i32
1986  [IntrNoMem, IntrSpeculatable]
1987>;
1988
1989def int_amdgcn_msad_u8 :
1990  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
1991  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],  // (i32, i32, i32) -> i32
1992  [IntrNoMem, IntrSpeculatable]
1993>;
1994
1995def int_amdgcn_sad_hi_u8 :
1996  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
1997  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],  // (i32, i32, i32) -> i32
1998  [IntrNoMem, IntrSpeculatable]
1999>;
2000
2001def int_amdgcn_sad_u16 :
2002  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
2003  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],  // (i32, i32, i32) -> i32
2004  [IntrNoMem, IntrSpeculatable]
2005>;
2006
2007def int_amdgcn_qsad_pk_u16_u8 :
2008  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
2009  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],  // (i64, i32, i64) -> i64
2010  [IntrNoMem, IntrSpeculatable]
2011>;
2012
2013def int_amdgcn_mqsad_pk_u16_u8 :
2014  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
2015  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],  // (i64, i32, i64) -> i64
2016  [IntrNoMem, IntrSpeculatable]
2017>;
2018
2019def int_amdgcn_mqsad_u32_u8 :
2020  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
2021  DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],  // (i64, i32, v4i32) -> v4i32
2022  [IntrNoMem, IntrSpeculatable]
2023>;
2024
2025def int_amdgcn_cvt_pk_u8_f32 :
2026  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
2027  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],  // (float, i32, i32) -> i32
2028  [IntrNoMem, IntrSpeculatable]
2029>;
2030
2031def int_amdgcn_icmp :
2032  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],  // operands 0 and 1 share one overloaded integer type; the result is a separate overloaded integer type
2033            [IntrNoMem, IntrConvergent,
2034             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // operand 2 must be an immediate (presumably the comparison predicate -- TODO confirm)
2035
2036def int_amdgcn_fcmp :
2037  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],  // operands 0 and 1 share one overloaded FP type; the result is an overloaded integer type
2038            [IntrNoMem, IntrConvergent,
2039             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // operand 2 must be an immediate (presumably the comparison predicate -- TODO confirm)
2040
2041def int_amdgcn_ballot :
2042  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],  // i1 operand -> overloaded integer result
2043            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2044
2045def int_amdgcn_inverse_ballot :
2046  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],  // overloaded integer operand -> i1 result
2047            [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // unlike int_amdgcn_ballot, not marked IntrConvergent
2048
2049// Lowers to S_BITREPLICATE_B64_B32.
2050// The argument must be uniform; otherwise, the result is undefined.
2051def int_amdgcn_s_bitreplicate :
2052  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;  // i32 -> i64
2053
2054// Lowers to S_QUADMASK_B{32,64}
2055// The argument must be uniform; otherwise, the result is undefined.
2056def int_amdgcn_s_quadmask :
2057  DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;  // result and operand are independently overloaded integer types
2058
2059// Lowers to S_WQM_B{32,64}
2060// The argument must be uniform; otherwise, the result is undefined.
2061// Does not set WQM; merely calculates the bitmask.
2062def int_amdgcn_s_wqm :
2063  DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;  // result and operand are independently overloaded integer types
2064
2065class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
2066    [data_ty],
2067    [
2068      LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR)
2069      llvm_i32_ty         // Reduction Strategy Switch for lowering ( 0: Default,
2070                          //                                          1: Iterative strategy, and
2071                          //                                          2: DPP)
2072    ],
2073    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;  // the strategy switch (operand 1) must be an immediate
2074
2075def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;  // uses the default data_ty (llvm_anyint_ty)
2076def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;  // uses the default data_ty (llvm_anyint_ty)
2077
2078def int_amdgcn_readfirstlane :
2079  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],  // overloaded on any type; the result type matches the operand
2080            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2081
2082// The lane argument must be uniform across the currently active threads of the
2083// current wave. Otherwise, the result is undefined.
2084def int_amdgcn_readlane :
2085  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],  // operand 0 has the result type; operand 1 is the i32 lane argument
2086            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // the lane argument is not an ImmArg
2087
2088// The value to write and lane select arguments must be uniform across the
2089// currently active threads of the current wave. Otherwise, the result is
2090// undefined.
2091def int_amdgcn_writelane :
2092  Intrinsic<[llvm_any_ty], [
2093    LLVMMatchType<0>,   // uniform value to write: returned by the selected lane
2094    llvm_i32_ty,        // uniform lane select
2095    LLVMMatchType<0>    // returned by all lanes other than the selected one
2096  ],
2097  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
2098>;
2099
2100def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
2101  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],  // (i32, i32, i32) -> i32
2102  [IntrNoMem, IntrSpeculatable]
2103>;
2104
2105// mul24 intrinsics can return i32 or i64.
2106// When returning i64, they're lowered to a mul24/mulhi24 pair.
2107def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
2108  [llvm_i32_ty, llvm_i32_ty],  // operands are always i32; only the result type is overloaded
2109  [IntrNoMem, IntrSpeculatable]
2110>;
2111
2112def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
2113  [llvm_i32_ty, llvm_i32_ty],  // operands are always i32; only the result type is overloaded
2114  [IntrNoMem, IntrSpeculatable]
2115>;
2116
2117def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],  // fixed i32 result (not overloaded)
2118  [llvm_i32_ty, llvm_i32_ty],
2119  [IntrNoMem, IntrSpeculatable]
2120>;
2121
2122def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],  // fixed i32 result (not overloaded)
2123  [llvm_i32_ty, llvm_i32_ty],
2124  [IntrNoMem, IntrSpeculatable]
2125>;
2126
2127// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
2128//
2129// bar_val is the total number of waves that will wait on this
2130// barrier, minus 1.
2131def int_amdgcn_ds_gws_init :
2132  ClangBuiltin<"__builtin_amdgcn_ds_gws_init">,
2133  Intrinsic<[],
2134  [llvm_i32_ty, llvm_i32_ty],  // (bar_val, resource_id)
2135  [IntrConvergent, IntrWriteMem,
2136   IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",  // write-only access to inaccessible memory; "" leaves the name argument empty
2137  [SDNPMemOperand]  // SelectionDAG node property: the node carries a memory operand
2138>;
2139
2140// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
2141// bar_val (presumably passed in vsrc0 -- TODO confirm) is the total number of
2142// waves that will wait on this barrier, minus 1.
2143def int_amdgcn_ds_gws_barrier :
2144  ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
2145  Intrinsic<[],
2146  [llvm_i32_ty, llvm_i32_ty],  // (vsrc0, resource_id)
2147  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",  // no IntrWriteMem here, unlike int_amdgcn_ds_gws_init
2148  [SDNPMemOperand]
2149>;
2150
2151// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
2152def int_amdgcn_ds_gws_sema_v :
2153  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
2154  Intrinsic<[],
2155  [llvm_i32_ty],  // resource_id
2156  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
2157  [SDNPMemOperand]
2158>;
2159
2160// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
2161def int_amdgcn_ds_gws_sema_br :
2162  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
2163  Intrinsic<[],
2164  [llvm_i32_ty, llvm_i32_ty],  // (vsrc, resource_id)
2165  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
2166  [SDNPMemOperand]
2167>;
2168
2169// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
2170def int_amdgcn_ds_gws_sema_p :
2171  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
2172  Intrinsic<[],
2173  [llvm_i32_ty],  // resource_id
2174  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
2175  [SDNPMemOperand]
2176>;
2177
2178// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
2179def int_amdgcn_ds_gws_sema_release_all :
2180  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
2181  Intrinsic<[],
2182  [llvm_i32_ty],  // resource_id
2183  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
2184  [SDNPMemOperand]
2185>;
2186
2187
2188// Copies the source value to the destination value, with the guarantee that
2189// the source value is computed as if the entire program were executed in WQM.
2190def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
2191  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]  // result type matches the source operand; not marked IntrConvergent
2192>;
2193
2194// Copies the source value to the destination value, such that the source
2195// is computed as if the entire program were executed in WQM if any other
2196// program code executes in WQM.
2197def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
2198  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]  // result type matches the source operand; not marked IntrConvergent
2199>;
2200
2201// Return true if at least one thread within the pixel quad passes true into
2202// the function.
2203def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
2204  [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]  // i1 -> i1; marked IntrConvergent
2205>;
2206
2207// If false, set EXEC=0 for the current thread until the end of program.
2208// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
2209def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>;  // i1 condition; no memory or willreturn attributes are declared (see FIXME)
2210
2211def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">,
2212  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrConvergent,  // no operands or result; never returns and is marked cold
2213                     IntrNoCallback, IntrNoFree]
2214>;
2215
2216// If false, mark all active lanes as helper lanes until the end of program.
2217def int_amdgcn_wqm_demote : Intrinsic<[],
2218  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree]  // i1 condition; declared as a write to inaccessible memory; no IntrWillReturn
2219>;
2220
2221// Copies the active channels of the source value to the destination value,
2222// with the guarantee that the source value is computed as if the entire
2223// program were executed in Whole Wavefront Mode, i.e. with all channels
2224// enabled, with a few exceptions: - Phi nodes which require WWM return an
2225// undefined value.
2226def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
2227  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,  // result type matches the source operand
2228                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
2229>;
2230// Deprecated. Use int_amdgcn_strict_wwm instead.
2231def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
2232  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,  // same signature and attributes as int_amdgcn_strict_wwm
2233                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
2234>;
2235def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
2236  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,  // same signature and attributes as int_amdgcn_strict_wwm
2237                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
2238>;
2239
2240// Given a value, copies it while setting all the inactive lanes to a given
2241// value. Note that OpenGL helper lanes are considered active, so if the
2242// program ever uses WQM, then the instruction and the first source will be
2243// computed in WQM.
2244def int_amdgcn_set_inactive :
2245  Intrinsic<[llvm_any_ty],      // overloaded on any type
2246            [LLVMMatchType<0>, // value to be copied
2247             LLVMMatchType<0>], // value for the inactive lanes to take
2248            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2249
2250// Similar to int_amdgcn_set_inactive, but the value for the inactive lanes must
2251// be a VGPR function argument.
2252// Can only be used in functions with the `amdgpu_cs_chain` or
2253// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control
2254// flow.
2255def int_amdgcn_set_inactive_chain_arg :
2256  Intrinsic<[llvm_anyint_ty],   // integer-only overload, unlike int_amdgcn_set_inactive (llvm_any_ty)
2257            [LLVMMatchType<0>, // value to be copied
2258             LLVMMatchType<0>], // value for the inactive lanes to take
2259            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2260
2261// Return true if the given flat pointer points to a local memory address.
2262def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
2263  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],  // pointer -> i1
2264  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]  // the pointer is only inspected: no memory access, not captured
2265>;
2266
2267// Return true if the given flat pointer points to a private memory address.
2268def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
2269  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],  // pointer -> i1
2270  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]  // the pointer is only inspected: no memory access, not captured
2271>;
2272
2273// A uniform tail call to a function with the `amdgpu_cs_chain` or
2274// `amdgpu_cs_chain_preserve` calling convention. It will populate the SGPRs
2275// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to
2276// the given function.
2277// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or
2278// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control
2279// flow.
2280def int_amdgcn_cs_chain:
2281  Intrinsic<[],
2282            [llvm_anyptr_ty, // The function to jump to.
2283             llvm_anyint_ty, // Value to put in EXEC (should be i32 or i64).
2284             llvm_any_ty, // Arguments that will be copied into SGPRs (s0+).
2285                          // Must be uniform.
2286             llvm_any_ty, // Arguments that will be copied into VGPRs (v8+).
2287                          // Need not be uniform.
2288             llvm_i32_ty, // Flags.
2289             llvm_vararg_ty // Additional arguments. Only present if Flags is
2290                            // non-zero.
2291            ],
2292            [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;  // never returns; Flags (operand 4) must be an immediate
2293
2294
2295//===----------------------------------------------------------------------===//
2296// CI+ Intrinsics
2297//===----------------------------------------------------------------------===//
2298
2299def int_amdgcn_s_dcache_inv_vol :
2300  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
2301  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;  // no operands or result; marked as having side effects despite IntrNoMem
2302
2303def int_amdgcn_buffer_wbinvl1_vol :
2304  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
2305  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;  // no operands or result; marked as having side effects despite IntrNoMem
2306
2307//===----------------------------------------------------------------------===//
2308// VI Intrinsics
2309//===----------------------------------------------------------------------===//
2310
2311// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
2312def int_amdgcn_mov_dpp :
2313  Intrinsic<[llvm_anyint_ty],  // overloaded integer result; <src> has the same type
2314            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
2315             llvm_i1_ty],
2316             [IntrNoMem, IntrConvergent, IntrWillReturn,
2317             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,  // <dpp_ctrl>, <row_mask>
2318             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;  // <bank_mask>, <bound_ctrl>: every control operand is an immediate
2319
2320// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
2321// Should be equivalent to:
2322// v_mov_b32 <dest> <old>
2323// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
2324def int_amdgcn_update_dpp :
2325  Intrinsic<[llvm_any_ty],  // overloaded on any type; <old> and <src> have the result type
2326            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
2327            llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
2328             [IntrNoMem, IntrConvergent, IntrWillReturn,
2329              ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,  // <dpp_ctrl>, <row_mask>
2330              ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;  // <bank_mask>, <bound_ctrl>: every control operand is an immediate
2331
2332def int_amdgcn_s_dcache_wb :
2333  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
2334  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // no operands or result
2335
2336def int_amdgcn_s_dcache_wb_vol :
2337  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
2338  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // no operands or result
2339
2340def int_amdgcn_s_memrealtime :
2341  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
2342  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // i64 result, no operands; marked IntrHasSideEffects
2343
2344// llvm.amdgcn.ds.permute <index> <src>
2345def int_amdgcn_ds_permute :
2346  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
2347  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],  // (index, src) -> i32
2348    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2349
2350// llvm.amdgcn.ds.bpermute <index> <src>
2351def int_amdgcn_ds_bpermute :
2352  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
2353  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],  // (index, src) -> i32
2354     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2355
2356// llvm.amdgcn.perm <src0> <src1> <selector>
2357def int_amdgcn_perm :
2358  ClangBuiltin<"__builtin_amdgcn_perm">,
2359  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],  // (src0, src1, selector) -> i32
2360     [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // speculatable and not convergent, unlike the ds.permute intrinsics; <selector> is not an ImmArg
2361
2362//===----------------------------------------------------------------------===//
2363// GFX9 Intrinsics
2364//===----------------------------------------------------------------------===//
2365
2366class AMDGPUGlobalLoadLDS :
2367  ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
2368  Intrinsic <
2369    [],                                 // no result
2370    [LLVMQualPointerType<1>,            // Base global pointer to load from
2371     LLVMQualPointerType<3>,            // LDS base pointer to store to
2372     llvm_i32_ty,                       // Data byte size: 1/2/4
2373     llvm_i32_ty,                       // imm offset (applied to both global and LDS address)
2374     llvm_i32_ty],                      // auxiliary data (imm, cachepolicy (bit 0 = sc0,
2375                                        //                                   bit 1 = sc1,
2376                                        //                                   bit 4 = scc))
2377    [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,  // neither pointer is captured
2378     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],  // size, offset and aux data must be immediates
2379     "", [SDNPMemOperand]>;
2380def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
2381
2382// This is IntrHasSideEffects because it reads from a volatile hardware register.
2383def int_amdgcn_pops_exiting_wave_id :
2384  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>;  // i32 result, no operands
2385
2386//===----------------------------------------------------------------------===//
2387// GFX10 Intrinsics
2388//===----------------------------------------------------------------------===//
2389
2390// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
2391def int_amdgcn_permlane16 :
2392  Intrinsic<[llvm_any_ty],  // overloaded on any type; <old> and <src0> have the result type
2393            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2394            [IntrNoMem, IntrConvergent, IntrWillReturn,
2395             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;  // <fi> and <bound_control> must be immediates
2396
2397// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
2398def int_amdgcn_permlanex16 :
2399  Intrinsic<[llvm_any_ty],  // overloaded on any type; <old> and <src0> have the result type
2400            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2401            [IntrNoMem, IntrConvergent, IntrWillReturn,
2402             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;  // <fi> and <bound_control> must be immediates
2403
2404// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
2405// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
2406// the lanes to read from.
2407def int_amdgcn_mov_dpp8 :
2408  Intrinsic<[llvm_anyint_ty],  // overloaded integer result; <src> has the same type
2409            [LLVMMatchType<0>, llvm_i32_ty],
2410            [IntrNoMem, IntrConvergent, IntrWillReturn,
2411             ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>;  // <sel> must be an immediate
2412
2413def int_amdgcn_s_get_waveid_in_workgroup :
2414  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
2415  Intrinsic<[llvm_i32_ty], [],  // i32 result, no operands
2416    [NoUndef<RetIndex>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // result is declared noundef
2417
2418class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
2419  [vt], // returned value has the same type as vdata
2420  [pt,  // vaddr
2421   vt], // vdata(VGPR)
2422  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",  // only accesses memory through its pointer argument; vaddr is not captured
2423  [SDNPMemOperand]>;
2424
2425def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>;  // i32 data; pointer type left at the overloaded default (llvm_anyptr_ty)
2426
2427// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
2428//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
2429// <node_ptr> is i32 or i64.
2430// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
2431def int_amdgcn_image_bvh_intersect_ray :
2432  DefaultAttrsIntrinsic<[llvm_v4i32_ty],
2433            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,  // <node_ptr>, <ray_extent>, <ray_origin>, <ray_dir>
2434             LLVMMatchType<1>, llvm_v4i32_ty],  // <ray_inv_dir> (same type as <ray_dir>), <texture_descr>
2435            [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;  // NOTE(review): the last three look redundant with DefaultAttrsIntrinsic's defaults -- confirm
2436
2437//===----------------------------------------------------------------------===//
2438// GFX11 Intrinsics
2439//===----------------------------------------------------------------------===//
2440
2441// llvm.amdgcn.permlane64 <src0>
2442def int_amdgcn_permlane64 :
2443  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],  // overloaded on any type; the result type matches <src0>
2444            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2445
2446def int_amdgcn_ds_add_gs_reg_rtn :
2447  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
2448  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],  // overloaded integer result; two i32 operands
2449            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],  // operand 1 must be an immediate; no memory-effect attribute is declared
2450            "", [SDNPMemOperand]>;
2451
2452def int_amdgcn_ds_sub_gs_reg_rtn :
2453  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
2454  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],  // overloaded integer result; two i32 operands
2455            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],  // operand 1 must be an immediate; no memory-effect attribute is declared
2456            "", [SDNPMemOperand]>;
2457
2458def int_amdgcn_ds_bvh_stack_rtn :
2459  Intrinsic<
2460    [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr (two results)
2461    [
2462      llvm_i32_ty,   // %addr
2463      llvm_i32_ty,   // %data0
2464      llvm_v4i32_ty, // %data1
2465      llvm_i32_ty,   // %offset
2466    ],
2467    [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]  // %offset must be an immediate; no memory-effect attribute is declared
2468  >;
2469
2470def int_amdgcn_s_wait_event_export_ready :
2471  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
2472  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]  // NOTE(review): no IntrNoCallback/IntrNoFree here, unlike int_amdgcn_s_dcache_wb and similar -- confirm this is intended
2473>;
2474
2475// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2476//
2477// These operations perform a matrix multiplication and accumulation of
2478// the form: D = A * B + C .
2479
2480class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
2481  Intrinsic<
2482    [CD], // %D
2483    [
2484      AB,               // %A
2485      LLVMMatchType<1>, // %B (matches %A, assuming CD and AB are overloaded types 0 and 1)
2486      LLVMMatchType<0>, // %C (matches %D)
2487    ],
2488    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]  // no immediate operands
2489>;
2490
2491class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
2492  Intrinsic<
2493    [CD], // %D
2494    [
2495      AB,               // %A
2496      LLVMMatchType<1>, // %B (matches %A, assuming CD and AB are overloaded types 0 and 1)
2497      LLVMMatchType<0>, // %C (matches %D)
2498      llvm_i1_ty,       // %high (op_sel) for GFX11, 0 for GFX12
2499    ],
2500    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]  // %high must be an immediate
2501>;
2502
2503class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
2504  Intrinsic<
2505    [CD], // %D
2506    [
2507      llvm_i1_ty,       // %A_sign
2508      AB,               // %A
2509      llvm_i1_ty,       // %B_sign
2510      LLVMMatchType<1>, // %B (matches %A, assuming CD and AB are overloaded types 0 and 1)
2511      LLVMMatchType<0>, // %C (matches %D)
2512      llvm_i1_ty,       // %clamp
2513    ],
2514    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]  // %A_sign, %B_sign and %clamp must be immediates
2515>;
2516
2517// WMMA GFX11Only
2518
2519// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
2520// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
2521// The content of the other 16-bit half is preserved from the input.
2522
2523defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX11 = {  // every def inside the braces is also collected into this list
2524def int_amdgcn_wmma_f16_16x16x16_f16_tied   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
2525def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
2526
2527// WMMA GFX11Plus
2528
2529def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>;
2530def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;  // integer-typed A/B inputs, floating-point accumulator
2531def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
2532def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
2533
2534// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
2535//        The content of the other 16-bit half is undefined.
2536// GFX12: The op_sel bit must be 0.
2537def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
2538def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
2539}
2540
2541//===----------------------------------------------------------------------===//
2542// GFX12 Intrinsics
2543//===----------------------------------------------------------------------===//
2544
// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
//
// The result and the three data operands are i32; <fi> and <bound_control>
// are i1 immediates.
def int_amdgcn_permlane16_var
    : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
      Intrinsic<
        [llvm_i32_ty],   // result
        [
          llvm_i32_ty,   // <old>
          llvm_i32_ty,   // <src0>
          llvm_i32_ty,   // <src1>
          llvm_i1_ty,    // <fi>
          llvm_i1_ty     // <bound_control>
        ],
        [IntrNoMem, IntrConvergent, IntrWillReturn,
         ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
         IntrNoCallback, IntrNoFree]
      >;
2551
// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control>
//
// The result and the three data operands are i32; <fi> and <bound_control>
// are i1 immediates.
def int_amdgcn_permlanex16_var
    : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">,
      Intrinsic<
        [llvm_i32_ty],   // result
        [
          llvm_i32_ty,   // <old>
          llvm_i32_ty,   // <src0>
          llvm_i32_ty,   // <src1>
          llvm_i1_ty,    // <fi>
          llvm_i1_ty     // <bound_control>
        ],
        [IntrNoMem, IntrConvergent, IntrWillReturn,
         ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
         IntrNoCallback, IntrNoFree]
      >;
2558
// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
//
// These operations perform a sparse matrix multiplication and accumulation of
// the form: D = A * B + C.
// A is a sparse matrix, half the size of B, and is expanded using the
// sparsity index.

// SWMMAC shape without sign selectors.
//
// Result:   CD %D
// Operands: 0: A %A
//           1: B %B
//           2: LLVMMatchType<0> %C
//           3: Index, the sparsity index for A
//
// The property list carries IntrNoCallback and IntrNoFree so that it agrees
// with the dense WMMA intrinsic classes defined earlier in this file.
class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
  Intrinsic<
    [CD],               // %D
    [
      A,                // %A
      B,                // %B
      LLVMMatchType<0>, // %C
      Index             // %Sparsity index for A
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
2576
// SWMMAC shape with explicit sign selectors for A and B.
//
// Result:   CD %D
// Operands: 0: i1 %A_sign              (immediate)
//           1: A %A
//           2: i1 %B_sign              (immediate)
//           3: B %B
//           4: LLVMMatchType<0> %C
//           5: Index, the sparsity index for A
//           6: i1 %clamp               (immediate)
//
// The property list carries IntrNoCallback and IntrNoFree so that it agrees
// with the dense WMMA intrinsic classes defined earlier in this file.
class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
  Intrinsic<
    [CD],               // %D
    [
      llvm_i1_ty,       // %A_sign
      A,                // %A
      llvm_i1_ty,       // %B_sign
      B,                // %B
      LLVMMatchType<0>, // %C
      Index,            // %Sparsity index for A
      llvm_i1_ty        // %clamp
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;
2591
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = {
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics.
// Each one performs a matrix multiplication and accumulation of the form
// D = A * B + C.

// A and B are <8 x fp8> or <8 x bf8>. Because llvm does not support fp8 and
// bf8, <2 x i32> is used instead.
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8
    : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8
    : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8
    : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8
    : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;

// A and B are <16 x iu4>.
def int_amdgcn_wmma_i32_16x16x32_iu4
    : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;

// SWMMAC intrinsics. Template arguments are <A, B, C/D, sparsity index>.
def int_amdgcn_swmmac_f32_16x16x32_f16
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty,
                               llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf16
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty,
                               llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f16_16x16x32_f16
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty,
                               llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_bf16_16x16x32_bf16
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty,
                               llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x32_iu8
    : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty,
                                 llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x32_iu4
    : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty,
                                 llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x64_iu4
    : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty,
                                 llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty,
                               llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty,
                               llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty,
                               llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8
    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty,
                               llvm_anyfloat_ty, llvm_anyint_ty>;
}
2618
// Atomics built on AMDGPUAtomicRtn (declared outside this section).

// i64 data with an explicit global (addrspace 1) pointer type.
def int_amdgcn_global_atomic_ordered_add_b64
    : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;

// Floating-point min/max "num" variants, overloaded on the float type.
def int_amdgcn_flat_atomic_fmin_num
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax_num
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmin_num
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax_num
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

// i32 data.
def int_amdgcn_atomic_cond_sub_u32
    : AMDGPUAtomicRtn<llvm_i32_ty>;
2627
// Load-style intrinsic: a single pointer operand of type ptr_ty and a result
// overloaded on any type. The pointer operand is marked nocapture, the
// intrinsic only reads memory, and the SelectionDAG node carries a memory
// operand (SDNPMemOperand).
class AMDGPULoadIntrinsic<LLVMType ptr_ty>
    : Intrinsic<[llvm_any_ty],
                [ptr_ty],
                [IntrReadMem, IntrWillReturn, IntrConvergent,
                 NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
                "",
                [SDNPMemOperand]>;
2636
// Transposing global loads. Expected IR signatures and the instruction each
// one maps to:
//
//   Wave32
//     <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))   -> global_load_tr_b64
//     <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))  -> global_load_tr_b128
//   Wave64
//     i32       @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1))     -> global_load_tr_b64
//     <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))  -> global_load_tr_b128

def int_amdgcn_global_load_tr_b64
    : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_global_load_tr_b128
    : AMDGPULoadIntrinsic<global_ptr_ty>;
2646
// i32 @llvm.amdgcn.wave.id()
// Takes no operands; the i32 result is marked noundef.
def int_amdgcn_wave_id
    : DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [],
                            [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
2650
2651//===----------------------------------------------------------------------===//
2652// Deep learning intrinsics.
2653//===----------------------------------------------------------------------===//
2654
// llvm.amdgcn.fdot2
//   f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_fdot2
    : ClangBuiltin<"__builtin_amdgcn_fdot2">,
      DefaultAttrsIntrinsic<[llvm_float_ty],
                            [llvm_v2f16_ty, llvm_v2f16_ty, llvm_float_ty,
                             llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2669
// llvm.amdgcn.fdot2.f16.f16
//   f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16
    : ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
      DefaultAttrsIntrinsic<[llvm_half_ty],
                            [llvm_v2f16_ty, llvm_v2f16_ty, llvm_half_ty],
                            [IntrNoMem, IntrSpeculatable]>;
2683
// llvm.amdgcn.fdot2.bf16.bf16
//   bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16
    : ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
      DefaultAttrsIntrinsic<[llvm_bfloat_ty],
                            [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_bfloat_ty],
                            [IntrNoMem, IntrSpeculatable]>;
2697
// llvm.amdgcn.fdot2.f32.bf16
//   f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_fdot2_f32_bf16
    : ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
      DefaultAttrsIntrinsic<[llvm_float_ty],
                            [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_float_ty,
                             llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2712
// llvm.amdgcn.sdot2
//   i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_sdot2
    : ClangBuiltin<"__builtin_amdgcn_sdot2">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_i32_ty,
                             llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2727
// llvm.amdgcn.udot2
//   u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_udot2
    : ClangBuiltin<"__builtin_amdgcn_udot2">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_i32_ty,
                             llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2742
// llvm.amdgcn.sdot4
//   i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_sdot4
    : ClangBuiltin<"__builtin_amdgcn_sdot4">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2757
// llvm.amdgcn.udot4
//   u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_udot4
    : ClangBuiltin<"__builtin_amdgcn_udot4">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2772
// llvm.amdgcn.sudot4
//   i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a,
//                               i1 %b_sign, v4i8 (as i32) %b,
//                               i32 %c, i1 %clamp)
// Each input is treated as signed (_sign = 1) or unsigned (_sign = 0):
//   a[i in 0...3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
//   b[i in 0...3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// %a_sign, %b_sign and %clamp (operands 0, 2 and 5) must be immediates.
def int_amdgcn_sudot4
    : ClangBuiltin<"__builtin_amdgcn_sudot4">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_i1_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty,
                             llvm_i32_ty, llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable,
                             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>,
                             ImmArg<ArgIndex<5>>]>;
2793
// llvm.amdgcn.sdot8
//   i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_sdot8
    : ClangBuiltin<"__builtin_amdgcn_sdot8">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2809
// llvm.amdgcn.udot8
//   u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// %clamp (operand 3) must be an immediate.
def int_amdgcn_udot8
    : ClangBuiltin<"__builtin_amdgcn_udot8">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
2825
// llvm.amdgcn.sudot8
//   i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a,
//                               i1 %b_sign, v8i4 (as i32) %b,
//                               i32 %c, i1 %clamp)
// Each input is treated as signed (_sign = 1) or unsigned (_sign = 0):
//   a[i in 0...7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
//   b[i in 0...7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// %a_sign, %b_sign and %clamp (operands 0, 2 and 5) must be immediates.
def int_amdgcn_sudot8
    : ClangBuiltin<"__builtin_amdgcn_sudot8">,
      DefaultAttrsIntrinsic<[llvm_i32_ty],
                            [llvm_i1_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty,
                             llvm_i32_ty, llvm_i1_ty],
                            [IntrNoMem, IntrSpeculatable,
                             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>,
                             ImmArg<ArgIndex<5>>]>;
2847
// Four-element dot product of packed 8-bit float values accumulated into f32.
//   f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// The Clang builtin name is the record name with "int" replaced by
// "__builtin".
class AMDGPU8bitFloatDot4Intrinsic
    : ClangBuiltin<!subst("int", "__builtin", NAME)>,
      DefaultAttrsIntrinsic<[llvm_float_ty],
                            [llvm_i32_ty, llvm_i32_ty, llvm_float_ty],
                            [IntrNoMem, IntrSpeculatable]>;
2861
// One dot4 intrinsic per (type_a, type_b) pairing of fp8 and bf8.
def int_amdgcn_dot4_f32_fp8_bf8
    : AMDGPU8bitFloatDot4Intrinsic;
def int_amdgcn_dot4_f32_bf8_fp8
    : AMDGPU8bitFloatDot4Intrinsic;
def int_amdgcn_dot4_f32_fp8_fp8
    : AMDGPU8bitFloatDot4Intrinsic;
def int_amdgcn_dot4_f32_bf8_bf8
    : AMDGPU8bitFloatDot4Intrinsic;
2866
2867//===----------------------------------------------------------------------===//
2868// gfx908 intrinsics
//===----------------------------------------------------------------------===//
2870
// Global atomic fadd, overloaded on the floating-point data type.
def int_amdgcn_global_atomic_fadd
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
2872
// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
//
// srcA and srcB share SrcABTy; srcC and vdst share DestTy. cbsz, abid and
// blgp (operands 3, 4 and 5) are i32 immediates. The Clang builtin name is
// the record name with "int" replaced by "__builtin".
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy>
    : ClangBuiltin<!subst("int", "__builtin", NAME)>,
      DefaultAttrsIntrinsic<
        [DestTy],        // vdst
        [
          SrcABTy,       // srcA
          SrcABTy,       // srcB
          DestTy,        // srcC
          llvm_i32_ty,   // cbsz
          llvm_i32_ty,   // abid
          llvm_i32_ty    // blgp
        ],
        [IntrConvergent, IntrNoMem,
         ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]
      >;
2881
// MFMA intrinsics introduced with gfx908. Template arguments are
// <destination/accumulator type, srcA/srcB type>.
defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
// f32 sources.
def int_amdgcn_mfma_f32_32x32x1f32
    : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
// v4f16 sources.
def int_amdgcn_mfma_f32_32x32x4f16
    : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
// i8 sources, passed as i32.
def int_amdgcn_mfma_i32_32x32x4i8
    : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8
    : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8
    : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8
    : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8
    : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
// bf16 sources, passed as v2i16.
def int_amdgcn_mfma_f32_32x32x2bf16
    : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
}
2904
2905//===----------------------------------------------------------------------===//
2906// gfx90a intrinsics
//===----------------------------------------------------------------------===//
2908
// Floating-point atomics added with gfx90a, each overloaded on the float type.
def int_amdgcn_global_atomic_fmin
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax
    : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
2914
// MFMA intrinsics introduced with gfx90a. Template arguments are
// <destination/accumulator type, srcA/srcB type>.
defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
// bf16 "_1k" variants take v4i16 sources.
def int_amdgcn_mfma_f32_32x32x4bf16_1k
    : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;

// DGEMM MFMA.
// Note: on gfx940 the BLGP argument is replaced by a NEG bitfield: three
//       bits, each the neg modifier applied to the respective source operand.
def int_amdgcn_mfma_f64_16x16x4f64
    : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64
    : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
}
2928
2929//===----------------------------------------------------------------------===//
2930// gfx940 intrinsics
//===----------------------------------------------------------------------===//
2932
// MFMA with 8-bit float sources: srcA and srcB are both passed as i64.
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy>
    : AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

// Instantiates the class above once per source-format pairing, appending
// "_<kind>" to the defm name.
multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in {
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
  }
}
2940
// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
//
// srcC and vdst share DestTy. index is an ordinary i32 operand; cbsz and
// abid (operands 4 and 5) are i32 immediates. The Clang builtin name is the
// record name with "int" replaced by "__builtin".
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB>
    : ClangBuiltin<!subst("int", "__builtin", NAME)>,
      DefaultAttrsIntrinsic<
        [DestTy],        // vdst
        [
          SrcA,          // srcA
          SrcB,          // srcB
          DestTy,        // srcC
          llvm_i32_ty,   // index
          llvm_i32_ty,   // cbsz
          llvm_i32_ty    // abid
        ],
        [IntrConvergent, IntrNoMem,
         ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]
      >;
2949
// SMFMAC with 8-bit float sources: srcA is passed as v2i32, srcB as v4i32.
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy>
    : AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

// Instantiates the class above once per source-format pairing, appending
// "_<kind>" to the defm name.
multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in {
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
  }
}
2957
// The bf16 atomics take a v2i16 argument. The stated reason was that LLVM had
// no bf16 data type.
// NOTE(review): llvm_bfloat_ty / llvm_v2bf16_ty are used by other intrinsics
// in this file, so that rationale may be out of date - confirm before relying
// on it.
def int_amdgcn_global_atomic_fadd_v2bf16
    : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16
    : AMDGPUAtomicRtn<llvm_v2i16_ty>;
2961
// MFMA and SMFMAC intrinsics introduced with gfx940.
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
// Dense MFMA: <destination/accumulator type, srcA/srcB type>.
def int_amdgcn_mfma_i32_16x16x32_i8
    : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8
    : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32
    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32
    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// 8-bit float MFMA; each defm expands to the four bf8/fp8 pairings.
defm int_amdgcn_mfma_f32_16x16x32
    : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16
    : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// Sparse SMFMAC: <destination/accumulator type, srcA type, srcB type>.
def int_amdgcn_smfmac_f32_16x16x32_f16
    : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16
    : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16
    : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16
    : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8
    : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8
    : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

// 8-bit float SMFMAC; each defm expands to the four bf8/fp8 pairings.
defm int_amdgcn_smfmac_f32_16x16x64
    : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32
    : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
}
2981
// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects the byte of srcA to use; it is an i32 immediate.
def int_amdgcn_cvt_f32_bf8
    : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
      DefaultAttrsIntrinsic<
        [llvm_float_ty],   // vdst
        [
          llvm_i32_ty,     // srcA
          llvm_i32_ty      // byte_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<1>>]
      >;
2988
// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel (operand 1) is an i32 immediate.
def int_amdgcn_cvt_f32_fp8
    : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
      DefaultAttrsIntrinsic<
        [llvm_float_ty],   // vdst
        [
          llvm_i32_ty,     // srcA
          llvm_i32_ty      // byte_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<1>>]
      >;
2994
// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects the 2 high bytes, 0 selects the 2 low bytes; it is an
// i1 immediate.
def int_amdgcn_cvt_pk_f32_bf8
    : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
      DefaultAttrsIntrinsic<
        [llvm_v2f32_ty],   // vdst
        [
          llvm_i32_ty,     // srcA
          llvm_i1_ty       // word_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<1>>]
      >;
3001
// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel.
// word_sel (operand 1) is an i1 immediate.
def int_amdgcn_cvt_pk_f32_fp8
    : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
      DefaultAttrsIntrinsic<
        [llvm_v2f32_ty],   // vdst
        [
          llvm_i32_ty,     // srcA
          llvm_i1_ty       // word_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<1>>]
      >;
3007
// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects the 2 high bytes in the vdst, 0 selects the 2 low
// bytes; it is an i1 immediate.
def int_amdgcn_cvt_pk_bf8_f32
    : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
      DefaultAttrsIntrinsic<
        [llvm_i32_ty],     // vdst
        [
          llvm_float_ty,   // srcA
          llvm_float_ty,   // srcB
          llvm_i32_ty,     // old
          llvm_i1_ty       // word_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<3>>]
      >;
3014
// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel (operand 3) is an i1 immediate.
def int_amdgcn_cvt_pk_fp8_f32
    : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
      DefaultAttrsIntrinsic<
        [llvm_i32_ty],     // vdst
        [
          llvm_float_ty,   // srcA
          llvm_float_ty,   // srcB
          llvm_i32_ty,     // old
          llvm_i1_ty       // word_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<3>>]
      >;
3020
// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects the byte to write into vdst; it is an i32 immediate.
def int_amdgcn_cvt_sr_bf8_f32
    : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
      DefaultAttrsIntrinsic<
        [llvm_i32_ty],     // vdst
        [
          llvm_float_ty,   // srcA
          llvm_i32_ty,     // srcB
          llvm_i32_ty,     // old
          llvm_i32_ty      // byte_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<3>>]
      >;
3027
// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel (operand 3) is an i32 immediate.
def int_amdgcn_cvt_sr_fp8_f32
    : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
      DefaultAttrsIntrinsic<
        [llvm_i32_ty],     // vdst
        [
          llvm_float_ty,   // srcA
          llvm_i32_ty,     // srcB
          llvm_i32_ty,     // old
          llvm_i32_ty      // byte_sel
        ],
        [IntrNoMem, ImmArg<ArgIndex<3>>]
      >;
3033
3034//===----------------------------------------------------------------------===//
3035// Special Intrinsics for backend internal use only. No frontend
3036// should emit calls to these.
//===----------------------------------------------------------------------===//
3038//
3039// Control-flow intrinsics in LLVM IR are convergent because they represent the
3040// wave CFG, i.e., sets of threads that are "converged" or "execute in
3041// lock-step". But they exist during a small window in the lowering process,
3042// inserted after the structurizer and then translated to equivalent MIR
3043// pseudos. So rather than create convergence tokens for these builtins, we
3044// simply mark them as not convergent.
3045//
3046// This is really a workaround to allow control flow lowering in the presence of
3047// convergence control tokens. The corresponding MIR pseudos are marked as
3048// having side effects, which is sufficient to prevent optimizations without
3049// having to mark them as convergent.
// Control-flow "if": one i1 operand; yields an i1 together with a value of
// an overloaded integer type. Deliberately not marked convergent.
def int_amdgcn_if
    : Intrinsic<
        [llvm_i1_ty, llvm_anyint_ty],   // results
        [llvm_i1_ty],                   // operand
        [IntrWillReturn, IntrNoCallback, IntrNoFree]
      >;
3053
// Control-flow "else": one overloaded-integer operand; yields an i1 together
// with a value of an overloaded integer type. Deliberately not marked
// convergent.
def int_amdgcn_else
    : Intrinsic<
        [llvm_i1_ty, llvm_anyint_ty],   // results
        [llvm_anyint_ty],               // operand
        [IntrWillReturn, IntrNoCallback, IntrNoFree]
      >;
3057
// Control-flow "if.break": takes an i1 and a value of the result's integer
// type, and returns that overloaded integer type. Does not access memory.
def int_amdgcn_if_break
    : Intrinsic<
        [llvm_anyint_ty],                 // result
        [llvm_i1_ty, LLVMMatchType<0>],   // operands
        [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]
      >;
3062
// Control-flow "loop": one overloaded-integer operand; yields an i1.
// Deliberately not marked convergent.
def int_amdgcn_loop
    : Intrinsic<
        [llvm_i1_ty],       // result
        [llvm_anyint_ty],   // operand
        [IntrWillReturn, IntrNoCallback, IntrNoFree]
      >;
3066
// Control-flow "end.cf": one overloaded-integer operand and no result.
// Deliberately not marked convergent.
def int_amdgcn_end_cf
    : Intrinsic<
        [],                 // no result
        [llvm_anyint_ty],   // operand
        [IntrWillReturn, IntrNoCallback, IntrNoFree]
      >;
3069
// Stands for "unreachable" inside a divergent region. Takes no operands and
// produces no result; note that it is not marked IntrWillReturn.
def int_amdgcn_unreachable
    : Intrinsic<
        [],
        [],
        [IntrConvergent, IntrNoCallback, IntrNoFree]
      >;
3072
// f32 division with 2.5 ulp accuracy and no denormal support. Only a pass
// acting on !fpmath metadata should insert this.
def int_amdgcn_fdiv_fast
    : DefaultAttrsIntrinsic<[llvm_float_ty],
                            [llvm_float_ty, llvm_float_ty],
                            [IntrNoMem, IntrSpeculatable]>;
3079
/// An addrspacecast emitted without null pointer checking.
/// Only a pass that has analyzed the addrspacecast's source should insert
/// this. Both the operand and the result are overloaded pointer types.
def int_amdgcn_addrspacecast_nonnull
    : DefaultAttrsIntrinsic<[llvm_anyptr_ty],
                            [llvm_anyptr_ty],
                            [IntrNoMem, IntrSpeculatable]>;
3086}
3087