1//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file defines all of the R600-specific intrinsics. 10// 11//===----------------------------------------------------------------------===// 12 13def global_ptr_ty : LLVMQualPointerType<1>; 14 15// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred 16// by the backend cause whole-program undefined behavior when violated, such as 17// by causing all other preload register intrinsics to return arbitrarily incorrect 18// values. In non-entry-point functions, attempting to call a function that needs 19// some preloaded register from a function that is known to not need it is a violation 20// of the calling convention and also program-level UB. Outside of such IR-level UB, 21// these preloaded registers are always set to a well-defined value and are thus `noundef`. 22class AMDGPUReadPreloadRegisterIntrinsic 23 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 24 25class AMDGPUReadPreloadRegisterIntrinsicNamed<string name> 26 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>; 27 28// Used to tag image and resource intrinsics with information used to generate 29// mem operands. 30class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> { 31 int RsrcArg = rsrcarg; 32 bit IsImage = isimage; 33} 34 35let TargetPrefix = "r600" in { 36 37multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz { 38 def _x : AMDGPUReadPreloadRegisterIntrinsic; 39 def _y : AMDGPUReadPreloadRegisterIntrinsic; 40 def _z : AMDGPUReadPreloadRegisterIntrinsic; 41} 42 43multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> { 44 def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>; 45 def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>; 46 def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>; 47} 48 49defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 50 <"__builtin_r600_read_global_size">; 51defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 52 <"__builtin_r600_read_ngroups">; 53defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 54 <"__builtin_r600_read_tgid">; 55 56defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; 57defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; 58 59def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">, 60 Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>; 61 62// AS 7 is PARAM_I_ADDRESS, used for kernel arguments 63def int_r600_implicitarg_ptr : 64 ClangBuiltin<"__builtin_r600_implicitarg_ptr">, 65 DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [], 66 [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 67 68def int_r600_rat_store_typed : 69 // 1st parameter: Data 70 // 2nd parameter: Index 71 // 3rd parameter: Constant RAT ID 72 DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>, 73 ClangBuiltin<"__builtin_r600_rat_store_typed">; 74 75def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic< 76 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 77>; 78 79def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic< 80 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 81>; 82 83def int_r600_cube : DefaultAttrsIntrinsic< 84 [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] 85>; 86 87def int_r600_store_stream_output : DefaultAttrsIntrinsic< 88 [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] 89>; 90 91class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [ 92 llvm_v4f32_ty, // Coord 93 llvm_i32_ty, // offset_x 94 llvm_i32_ty, // offset_y, 95 llvm_i32_ty, // offset_z, 96 llvm_i32_ty, // resource_id 97 llvm_i32_ty, // samplerid 98 llvm_i32_ty, // coord_type_x 99 llvm_i32_ty, // coord_type_y 100 llvm_i32_ty, // coord_type_z 101 llvm_i32_ty], // coord_type_w 102 [IntrNoMem] 103>; 104 105class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [ 106 llvm_v4i32_ty, // Coord 107 llvm_i32_ty, // offset_x 108 llvm_i32_ty, // offset_y, 109 llvm_i32_ty, // offset_z, 110 llvm_i32_ty, // resource_id 111 llvm_i32_ty, // samplerid 112 llvm_i32_ty, // coord_type_x 113 llvm_i32_ty, // coord_type_y 114 llvm_i32_ty, // coord_type_z 115 llvm_i32_ty], // coord_type_w 116 [IntrNoMem] 117>; 118 119def int_r600_store_swizzle : 120 Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 121>; 122 123def int_r600_tex : TextureIntrinsicFloatInput; 124def int_r600_texc : TextureIntrinsicFloatInput; 125def int_r600_txl : TextureIntrinsicFloatInput; 126def int_r600_txlc : TextureIntrinsicFloatInput; 127def int_r600_txb : TextureIntrinsicFloatInput; 128def int_r600_txbc : TextureIntrinsicFloatInput; 129def int_r600_txf : TextureIntrinsicInt32Input; 130def int_r600_txq : TextureIntrinsicInt32Input; 131def int_r600_ddx : TextureIntrinsicFloatInput; 132def int_r600_ddy : TextureIntrinsicFloatInput; 133 134def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty], 135 [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] 136>; 137 138def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>; 139 140} // End TargetPrefix = "r600" 141 142let TargetPrefix = "amdgcn" in { 143 144//===----------------------------------------------------------------------===// 145// ABI Special Intrinsics 146//===----------------------------------------------------------------------===// 147 148defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; 149defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 150 <"__builtin_amdgcn_workgroup_id">; 151 152def int_amdgcn_dispatch_ptr : 153 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 154 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>; 155 156def int_amdgcn_queue_ptr : 157 ClangBuiltin<"__builtin_amdgcn_queue_ptr">, 158 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 159 [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>; 160 161def int_amdgcn_kernarg_segment_ptr : 162 ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, 163 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 164 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 165 166def int_amdgcn_implicitarg_ptr : 167 ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">, 168 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 169 [Align<RetIndex, 4>, NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 170 171// Returns the amount of LDS statically allocated for this program. 172// This is no longer guaranteed to be a compile-time constant due to linking 173// support. 174def int_amdgcn_groupstaticsize : 175 ClangBuiltin<"__builtin_amdgcn_groupstaticsize">, 176 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 177 178def int_amdgcn_dispatch_id : 179 ClangBuiltin<"__builtin_amdgcn_dispatch_id">, 180 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 181 182// For internal use. Coordinates LDS lowering between IR transform and backend. 183def int_amdgcn_lds_kernel_id : 184 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 185 186def int_amdgcn_implicit_buffer_ptr : 187 ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, 188 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 189 [Align<RetIndex, 4>, NoUndef<RetIndex>, 190 IntrNoMem, IntrSpeculatable]>; 191 192// Set EXEC to the 64-bit value given. 193// This is always moved to the beginning of the basic block. 194// FIXME: Should be mangled for wave size. 195def int_amdgcn_init_exec : Intrinsic<[], 196 [llvm_i64_ty], // 64-bit literal constant 197 [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback, 198 IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>; 199 200// Set EXEC according to a thread count packed in an SGPR input: 201// thread_count = (input >> bitoffset) & 0x7f; 202// This is always moved to the beginning of the basic block. 203// Note: only inreg arguments to the parent function are valid as 204// inputs to this intrinsic, computed values cannot be used. 205def int_amdgcn_init_exec_from_input : Intrinsic<[], 206 [llvm_i32_ty, // 32-bit SGPR input 207 llvm_i32_ty], // bit offset of the thread count 208 [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback, 209 IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>; 210 211def int_amdgcn_wavefrontsize : 212 ClangBuiltin<"__builtin_amdgcn_wavefrontsize">, 213 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 214 215// Represent a relocation constant. 216def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic< 217 [llvm_i32_ty], [llvm_metadata_ty], 218 [IntrNoMem, IntrSpeculatable] 219>; 220 221//===----------------------------------------------------------------------===// 222// Instruction Intrinsics 223//===----------------------------------------------------------------------===// 224 225// The first parameter is s_sendmsg immediate (i16), 226// the second one is copied to m0 227def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, 228 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], 229 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 230def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, 231 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], 232 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 233 234 235// gfx11 intrinsic 236// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64. 237def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], 238 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 239 240def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, 241 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 242 243def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">, 244 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 245 IntrNoCallback, IntrNoFree]>; 246 247def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">, 248 Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 249 IntrNoCallback, IntrNoFree]>; 250 251def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">, 252 Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 253 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 254 255def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">, 256 Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 257 IntrNoCallback, IntrNoFree]>; 258 259def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">, 260 Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, 261 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 262 263def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, 264 Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 265 IntrNoCallback, IntrNoFree]>; 266 267def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">, 268 Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 269 IntrNoCallback, IntrNoFree]>; 270 271def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">, 272 Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 273 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 274 275def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">, 276 Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 277 278def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">, 279 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 280 IntrNoCallback, IntrNoFree]>; 281 282def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">, 283 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 284 285// The 1st parameter is a mask for the types of instructions that may be allowed 286// to cross the SCHED_BARRIER during scheduling. 287// MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. 288// MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be 289// scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. 290// MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER. 291// MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. 292// MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER. 293// MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER. 294// MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER. 295// MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER. 296// MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER. 297// MASK = 0x0000 0100: ALL DS read instructions may be scheduled accoss SCHED_BARRIER. 298// MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER. 299def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, 300 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 301 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 302 303// The first parameter is a mask that determines the types of instructions that 304// you would like to synchronize around and add to a scheduling group. The 305// values of the mask are defined above for sched_barrier. These instructions 306// will be selected from the bottom up starting from the sched_group_barrier's 307// location during instruction scheduling. The second parameter is the number of 308// matching instructions that will be associated with this sched_group_barrier. 309// The third parameter is an identifier which is used to describe what other 310// sched_group_barriers should be synchronized with. 311def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">, 312 Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 313 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects, 314 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 315 316// Scheduler optimization hint. 317// MASK = 0: Small gemm opt 318def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, 319 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 320 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 321 322def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, 323 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 324 325// GFX12 intrinsics 326class AMDGPUWaitIntrinsic : 327 Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 328def int_amdgcn_s_wait_bvhcnt : AMDGPUWaitIntrinsic; 329def int_amdgcn_s_wait_dscnt : AMDGPUWaitIntrinsic; 330def int_amdgcn_s_wait_expcnt : AMDGPUWaitIntrinsic; 331def int_amdgcn_s_wait_kmcnt : AMDGPUWaitIntrinsic; 332def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic; 333def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic; 334def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic; 335 336def int_amdgcn_div_scale : DefaultAttrsIntrinsic< 337 // 1st parameter: Numerator 338 // 2nd parameter: Denominator 339 // 3rd parameter: Select quotient. Must equal Numerator or Denominator. 340 // (0 = Denominator, 1 = Numerator). 341 [llvm_anyfloat_ty, llvm_i1_ty], 342 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], 343 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>] 344>; 345 346def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 347 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], 348 [IntrNoMem, IntrSpeculatable] 349>; 350 351def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 352 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 353 [IntrNoMem, IntrSpeculatable] 354>; 355 356// Look Up 2.0 / pi src0 with segment select src1[4:0] 357def int_amdgcn_trig_preop : DefaultAttrsIntrinsic< 358 [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], 359 [IntrNoMem, IntrSpeculatable] 360>; 361 362def int_amdgcn_sin : DefaultAttrsIntrinsic< 363 [llvm_anyfloat_ty], [LLVMMatchType<0>], 364 [IntrNoMem, IntrSpeculatable] 365>; 366 367def int_amdgcn_cos : DefaultAttrsIntrinsic< 368 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 369>; 370 371// v_log_{f16|f32}, performs log2. f32 version does not handle 372// denormals. There is no reason to use this for f16 as it does 373// support denormals, and the generic log2 intrinsic should be 374// preferred. 375def int_amdgcn_log : DefaultAttrsIntrinsic< 376 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 377>; 378 379// v_exp_{f16|f32} (int_amdgcn_exp was taken by export 380// already). Performs exp2. f32 version does not handle 381// denormals. There is no reason to use this for f16 as it does 382// support denormals, and the generic exp2 intrinsic should be 383// preferred. 384def int_amdgcn_exp2 : DefaultAttrsIntrinsic< 385 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 386>; 387 388def int_amdgcn_log_clamp : DefaultAttrsIntrinsic< 389 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 390>; 391 392def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">, 393 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], 394 [IntrNoMem, IntrSpeculatable, Commutative] 395>; 396 397// Fused single-precision multiply-add with legacy behaviour for the multiply, 398// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is 399// intended for use on subtargets that have the v_fma_legacy_f32 and/or 400// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and 401// has a completely different kind of legacy behaviour.) 402def int_amdgcn_fma_legacy : 403 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], 404 [IntrNoMem, IntrSpeculatable, Commutative] 405>; 406 407def int_amdgcn_rcp : DefaultAttrsIntrinsic< 408 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 409>; 410 411def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">, 412 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], 413 [IntrNoMem, IntrSpeculatable] 414>; 415 416def int_amdgcn_sqrt : DefaultAttrsIntrinsic< 417 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 418>; 419 420def int_amdgcn_rsq : DefaultAttrsIntrinsic< 421 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 422>; 423 424def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">, 425 DefaultAttrsIntrinsic< 426 [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable] 427>; 428 429// out = 1.0 / sqrt(a) result clamped to +/- max_float. 430def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic< 431 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; 432 433def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic< 434 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 435>; 436 437def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic< 438 [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable] 439>; 440 441// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 442// and always uses rtz, so is not suitable for implementing the OpenCL 443// fract function. It should be ok on VI. 444def int_amdgcn_fract : DefaultAttrsIntrinsic< 445 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 446>; 447 448def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">, 449 DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], 450 [IntrNoMem, IntrSpeculatable] 451>; 452 453def int_amdgcn_cvt_pknorm_i16 : 454 ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">, 455 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], 456 [IntrNoMem, IntrSpeculatable] 457>; 458 459def int_amdgcn_cvt_pknorm_u16 : 460 ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">, 461 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], 462 [IntrNoMem, IntrSpeculatable] 463>; 464 465def int_amdgcn_cvt_pk_i16 : 466 ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">, 467 DefaultAttrsIntrinsic< 468 [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], 469 [IntrNoMem, IntrSpeculatable] 470>; 471 472def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">, 473 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], 474 [IntrNoMem, IntrSpeculatable] 475>; 476 477def int_amdgcn_class : DefaultAttrsIntrinsic< 478 [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], 479 [IntrNoMem, IntrSpeculatable] 480>; 481 482def int_amdgcn_fmed3 : 483 DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 484 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 485 [IntrNoMem, IntrSpeculatable] 486>; 487 488def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, 489 DefaultAttrsIntrinsic<[llvm_float_ty], 490 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 491 [IntrNoMem, IntrSpeculatable] 492>; 493 494def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">, 495 DefaultAttrsIntrinsic<[llvm_float_ty], 496 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 497 [IntrNoMem, IntrSpeculatable] 498>; 499 500def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">, 501 DefaultAttrsIntrinsic<[llvm_float_ty], 502 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 503 [IntrNoMem, IntrSpeculatable] 504>; 505 506def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">, 507 DefaultAttrsIntrinsic<[llvm_float_ty], 508 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 509 [IntrNoMem, IntrSpeculatable] 510>; 511 512// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz 513// should be used. 514def int_amdgcn_sffbh : 515 DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], 516 [IntrNoMem, IntrSpeculatable] 517>; 518 519// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support. 520def int_amdgcn_fmad_ftz : 521 DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 522 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 523 [IntrNoMem, IntrSpeculatable] 524>; 525 526// FIXME: The m0 argument should be moved after the normal arguments 527class AMDGPUDSOrderedIntrinsic : Intrinsic< 528 [llvm_i32_ty], 529 // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that 530 // the bit packing can be optimized at the IR level. 531 [LLVMQualPointerType<2>, // IntToPtr(M0) 532 llvm_i32_ty, // value to add or swap 533 llvm_i32_ty, // ordering 534 llvm_i32_ty, // scope 535 llvm_i1_ty, // isVolatile 536 llvm_i32_ty, // ordered count index (OA index), also added to the address 537 // gfx10: bits 24-27 indicate the number of active threads/dwords 538 llvm_i1_ty, // wave release, usually set to 1 539 llvm_i1_ty], // wave done, set to 1 for the last ordered instruction 540 [IntrWillReturn, NoCapture<ArgIndex<0>>, 541 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, 542 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree 543 ] 544>; 545 546class AMDGPUDSAppendConsumedIntrinsic : Intrinsic< 547 [llvm_i32_ty], 548 [llvm_anyptr_ty, // LDS or GDS ptr 549 llvm_i1_ty], // isVolatile 550 [IntrConvergent, IntrWillReturn, IntrArgMemOnly, 551 NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree], 552 "", 553 [SDNPMemOperand] 554>; 555 556def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic; 557def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; 558 559// The pointer argument is assumed to be dynamically uniform if a VGPR. 560def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; 561def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; 562 563} // TargetPrefix = "amdgcn" 564 565// New-style image intrinsics 566 567////////////////////////////////////////////////////////////////////////// 568// Dimension-aware image intrinsics framework 569////////////////////////////////////////////////////////////////////////// 570 571// Helper class to represent (type, name) combinations of arguments. The 572// argument names are explanatory and used as DAG operand names for codegen 573// pattern matching. 574class AMDGPUArg<LLVMType ty, string name> { 575 LLVMType Type = ty; 576 string Name = name; 577} 578 579// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...] 580class makeArgList<list<string> names, LLVMType basety> { 581 list<AMDGPUArg> ret = 582 !listconcat([AMDGPUArg<basety, names[0]>], 583 !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>)); 584} 585 586// Return arglist, with LLVMMatchType's references shifted by 'shift'. 587class arglistmatchshift<list<AMDGPUArg> arglist, int shift> { 588 list<AMDGPUArg> ret = 589 !foreach(arg, arglist, 590 !if(!isa<LLVMMatchType>(arg.Type), 591 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>, 592 arg.Name>, 593 arg)); 594} 595 596// Return the concatenation of the given arglists. LLVMMatchType's are adjusted 597// accordingly, and shifted by an additional 'shift'. 598class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> { 599 list<AMDGPUArg> ret = 600 !foldl([]<AMDGPUArg>, arglists, lhs, rhs, 601 !listconcat( 602 lhs, 603 arglistmatchshift<rhs, 604 !add(shift, !foldl(0, lhs, a, b, 605 !add(a, b.Type.isAny)))>.ret)); 606} 607 608// Represent texture/image types / dimensionality. 609class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix, 610 list<string> coord_names, list<string> slice_names, 611 bit msaa = 0> { 612 AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME); 613 string Name = name; // e.g. "2darraymsaa" 614 string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings) 615 bits<3> Encoding = enc; 616 bit DA = 0; // DA bit in MIMG encoding 617 bit MSAA = msaa; 618 619 list<AMDGPUArg> CoordSliceArgs = 620 makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret; 621 list<AMDGPUArg> CoordSliceIntArgs = 622 makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret; 623 list<AMDGPUArg> GradientArgs = 624 makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"), 625 !foreach(name, coord_names, "d" # name # "dv")), 626 llvm_anyfloat_ty>.ret; 627 628 bits<8> NumCoords = !size(CoordSliceArgs); 629 bits<8> NumGradients = !size(GradientArgs); 630} 631 632def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>; 633def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>; 634def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>; 635let DA = 1 in { 636 def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>; 637 def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>; 638 def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>; 639} 640def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>; 641let DA = 1 in { 642 def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>; 643} 644 645def AMDGPUDims { 646 list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D, 647 AMDGPUDimCube, AMDGPUDim1DArray, 648 AMDGPUDim2DArray]; 649 list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa]; 650 list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa); 651} 652 653// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof. 654class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> { 655 string UpperCaseMod = ucmod; 656 string LowerCaseMod = lcmod; 657 658 // {offset} {bias} {z-compare} 659 list<AMDGPUArg> ExtraAddrArgs = extra_addr; 660 bit Offset = false; 661 bit Bias = false; 662 bit ZCompare = false; 663 bit Gradients = false; 664 665 // Name of the {lod} or {clamp} argument that is appended to the coordinates, 666 // if any. 667 string LodOrClamp = ""; 668} 669 670// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE 671// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4 672defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = { 673 multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod, 674 list<AMDGPUArg> extra_addr> { 675 def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>; 676 let Offset = true in 677 def NAME#lcmod#_o : AMDGPUSampleVariant< 678 ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>; 679 } 680 681 multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod, 682 list<AMDGPUArg> extra_addr> { 683 defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>; 684 let ZCompare = true in 685 defm NAME : AMDGPUSampleHelper_Offset< 686 "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>; 687 } 688 689 multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod, 690 list<AMDGPUArg> extra_addr> { 691 defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>; 692 let LodOrClamp = "clamp" in 693 defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>; 694 } 695 696 defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = { 697 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>; 698 let Bias = true in 699 defm AMDGPUSample : AMDGPUSampleHelper_Clamp< 700 "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>; 701 let LodOrClamp = "lod" in 702 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>; 703 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>; 704 } 705 706 let Gradients = true in { 707 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>; 708 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>; 709 } 710} 711 712// Helper class to capture the profile of a dimension-aware image intrinsic. 713// This information is used to generate the intrinsic's type and to inform 714// codegen pattern matching. 715class AMDGPUDimProfile<string opmod, 716 AMDGPUDimProps dim> { 717 AMDGPUDimProps Dim = dim; 718 string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod 719 720 // These are intended to be overwritten by subclasses 721 bit IsSample = false; 722 bit IsAtomic = false; 723 list<LLVMType> RetTypes = []; 724 list<AMDGPUArg> DataArgs = []; 725 list<AMDGPUArg> ExtraAddrArgs = []; 726 bit Offset = false; 727 bit Bias = false; 728 bit ZCompare = false; 729 bit Gradients = false; 730 string LodClampMip = ""; 731 732 int NumRetAndDataAnyTypes = 733 !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b, 734 !add(a, b.isAny)); 735 736 list<AMDGPUArg> AddrArgs = 737 arglistconcat<[ExtraAddrArgs, 738 !if(Gradients, dim.GradientArgs, []), 739 !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs), 740 !if(!empty(LodClampMip), 741 []<AMDGPUArg>, 742 [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))], 743 NumRetAndDataAnyTypes>.ret; 744 list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type); 745 list<AMDGPUArg> AddrDefaultArgs = 746 !foreach(arg, AddrArgs, 747 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)), 748 !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type), 749 arg.Name>); 750 list<AMDGPUArg> AddrA16Args = 751 !foreach(arg, AddrArgs, 752 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)), 753 !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type), 754 arg.Name>); 755} 756 757class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> { 758 let IsSample = base.IsSample; 759 let IsAtomic = base.IsAtomic; 760 let RetTypes = base.RetTypes; 761 let DataArgs = base.DataArgs; 762 let ExtraAddrArgs = base.ExtraAddrArgs; 763 let Offset = base.Offset; 764 let Bias = base.Bias; 765 let ZCompare = base.ZCompare; 766 let Gradients = base.Gradients; 767 let LodClampMip = base.LodClampMip; 768} 769 770class AMDGPUDimSampleProfile<string opmod, 771 AMDGPUDimProps dim, 772 AMDGPUSampleVariant sample, 773 bit has_return = true> : AMDGPUDimProfile<opmod, dim> { 774 let IsSample = true; 775 let RetTypes = !if(has_return, [llvm_any_ty], []); 776 let ExtraAddrArgs = sample.ExtraAddrArgs; 777 let Offset = sample.Offset; 778 let Bias = sample.Bias; 779 let ZCompare = sample.ZCompare; 780 let Gradients = sample.Gradients; 781 let LodClampMip = sample.LodOrClamp; 782} 783 784class AMDGPUDimSampleNoReturnProfile<string opmod, 785 AMDGPUDimProps dim, 786 AMDGPUSampleVariant sample> 787 : AMDGPUDimSampleProfile<opmod, dim, sample, false> { 788} 789 790class AMDGPUDimNoSampleProfile<string opmod, 791 AMDGPUDimProps dim, 792 list<LLVMType> retty, 793 list<AMDGPUArg> dataargs, 794 bit Mip = false> : AMDGPUDimProfile<opmod, dim> { 795 let RetTypes = retty; 796 let DataArgs = dataargs; 797 let LodClampMip = !if(Mip, "mip", ""); 798} 799 800class AMDGPUDimAtomicProfile<string opmod, 801 AMDGPUDimProps dim, 802 list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> { 803 let RetTypes = [llvm_anyint_ty]; 804 let DataArgs = dataargs; 805 let IsAtomic = true; 806} 807 808class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim, 809 list<AMDGPUArg> dataargs> 810 : AMDGPUDimAtomicProfile<opmod, dim, dataargs> { 811 let RetTypes = [llvm_anyfloat_ty]; 812} 813 814class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> 815 : AMDGPUDimProfile<"GET_RESINFO", dim> { 816 let RetTypes = [llvm_anyfloat_ty]; 817 let DataArgs = []; 818 let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">]; 819 let LodClampMip = "mip"; 820} 821 822// Helper class for figuring out image intrinsic argument indexes. 823class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> { 824 int NumDataArgs = !size(P_.DataArgs); 825 int NumDmaskArgs = !not(P_.IsAtomic); 826 int NumOffsetArgs = !if(P_.Offset, 1, 0); 827 int NumBiasArgs = !if(P_.Bias, 1, 0); 828 int NumZCompareArgs = !if(P_.ZCompare, 1, 0); 829 int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs); 830 int NumVAddrArgs = !size(P_.AddrArgs); 831 int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0); 832 int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs)); 833 int NumRSrcArgs = 1; 834 int NumSampArgs = !if(P_.IsSample, 2, 0); 835 int DmaskArgIndex = NumDataArgs; 836 int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs); 837 int OffsetArgIndex = VAddrArgIndex; 838 int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs); 839 int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs); 840 int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs); 841 int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs); 842 int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1); 843 int MipArgIndex = LodArgIndex; 844 int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs); 845 int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs); 846 int UnormArgIndex = !add(SampArgIndex, 1); 847 int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs); 848 int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1); 849} 850 851// All dimension-aware intrinsics are derived from this class. 852class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_, 853 list<IntrinsicProperty> props, 854 list<SDNodeProperty> sdnodeprops> : Intrinsic< 855 P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return 856 !listconcat( 857 !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic 858 !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm) 859 P_.AddrTypes, // vaddr(VGPR) 860 [llvm_v8i32_ty], // rsrc(SGPR) 861 !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR) 862 llvm_i1_ty], []), // unorm(imm) 863 [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe) 864 llvm_i32_ty]), // auxiliary/cachepolicy(imm): 865 // bit 0 = glc, bit 1 = slc, 866 // bit 2 = dlc (gfx10/gfx11), 867 // bit 4 = scc (gfx90a) 868 // gfx940: bit 0 = sc0, bit 1 = nt, bit 4 = sc1 869 // gfx12+: bits [0-2] = th, bits [3-4] = scope 870 !listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn], 871 !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]), 872 !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []), 873 [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>, 874 ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>], 875 !if(P_.IsAtomic, [], [IntrNoSync])), 876 877 878 "", sdnodeprops>, 879 AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes), 880 !if(P_.IsAtomic, 0, 1)), 1> { 881 AMDGPUDimProfile P = P_; 882 883 AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME); 884 885 let TargetPrefix = "amdgcn"; 886} 887 888// Marker class for intrinsics with a DMask that determines the returned 889// channels. 890class AMDGPUImageDMaskIntrinsic; 891 892defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = { 893 894 ////////////////////////////////////////////////////////////////////////// 895 // Load and store intrinsics 896 ////////////////////////////////////////////////////////////////////////// 897 multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod, 898 list<LLVMType> retty, 899 list<AMDGPUArg> dataargs, 900 list<IntrinsicProperty> props, 901 list<SDNodeProperty> sdnodeprops, 902 bit Mip = false> { 903 foreach dim = AMDGPUDims.NoMsaa in { 904 def !strconcat(NAME, "_", dim.Name) 905 : AMDGPUImageDimIntrinsic< 906 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, 907 props, sdnodeprops>; 908 } 909 } 910 911 multiclass AMDGPUImageDimIntrinsicsAll<string opmod, 912 list<LLVMType> retty, 913 list<AMDGPUArg> dataargs, 914 list<IntrinsicProperty> props, 915 list<SDNodeProperty> sdnodeprops, 916 bit Mip = false> { 917 foreach dim = AMDGPUDims.All in { 918 def !strconcat(NAME, "_", dim.Name) 919 : AMDGPUImageDimIntrinsic< 920 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, 921 props, sdnodeprops>; 922 } 923 } 924 925 defm int_amdgcn_image_load 926 : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem], 927 [SDNPMemOperand]>, 928 AMDGPUImageDMaskIntrinsic; 929 defm int_amdgcn_image_load_mip 930 : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [], 931 [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>, 932 AMDGPUImageDMaskIntrinsic; 933 934 defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll< 935 "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], 936 [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>, 937 AMDGPUImageDMaskIntrinsic; 938 defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa< 939 "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], 940 [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>, 941 AMDGPUImageDMaskIntrinsic; 942 943 ////////////////////////////////////////////////////////////////////////// 944 // MSAA intrinsics 945 ////////////////////////////////////////////////////////////////////////// 946 foreach dim = AMDGPUDims.Msaa in { 947 def int_amdgcn_image_msaa_load_x # _ # dim.Name: 948 AMDGPUImageDimIntrinsic< 949 AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>, 950 [IntrReadMem], [SDNPMemOperand]>; 951 } 952 953 foreach dim = AMDGPUDims.Msaa in { 954 def int_amdgcn_image_msaa_load # _ # dim.Name: 955 AMDGPUImageDimIntrinsic< 956 AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>, 957 [IntrReadMem], [SDNPMemOperand]>; 958 } 959 960 ////////////////////////////////////////////////////////////////////////// 961 // sample and getlod intrinsics 962 ////////////////////////////////////////////////////////////////////////// 963 multiclass AMDGPUImageDimSampleDims<string opmod, 964 AMDGPUSampleVariant sample, 965 bit NoMem = false> { 966 foreach dim = AMDGPUDims.NoMsaa in { 967 def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic< 968 AMDGPUDimSampleProfile<opmod, dim, sample>, 969 !if(NoMem, [IntrNoMem], [IntrReadMem]), 970 !if(NoMem, [], [SDNPMemOperand])>; 971 } 972 } 973 974 foreach sample = AMDGPUSampleVariants in { 975 defm int_amdgcn_image_sample # sample.LowerCaseMod 976 : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>, 977 AMDGPUImageDMaskIntrinsic; 978 } 979 980 multiclass AMDGPUImageDimSampleNoReturnDims<string opmod, 981 AMDGPUSampleVariant sample> { 982 foreach dim = AMDGPUDims.NoMsaa in { 983 def !strconcat(NAME, "_", dim.Name, "_nortn") : AMDGPUImageDimIntrinsic< 984 AMDGPUDimSampleNoReturnProfile<opmod, dim, sample>, 985 [IntrWillReturn], [SDNPMemOperand]>; 986 } 987 } 988 foreach sample = AMDGPUSampleVariants in { 989 defm int_amdgcn_image_sample # sample.LowerCaseMod 990 : AMDGPUImageDimSampleNoReturnDims< 991 "SAMPLE" # sample.UpperCaseMod # "_nortn", sample>, 992 AMDGPUImageDMaskIntrinsic; 993 } 994 995 defm int_amdgcn_image_getlod 996 : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>, 997 AMDGPUImageDMaskIntrinsic; 998 999 ////////////////////////////////////////////////////////////////////////// 1000 // getresinfo intrinsics 1001 ////////////////////////////////////////////////////////////////////////// 1002 foreach dim = AMDGPUDims.All in { 1003 def !strconcat("int_amdgcn_image_getresinfo_", dim.Name) 1004 : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>, 1005 AMDGPUImageDMaskIntrinsic; 1006 } 1007 1008 ////////////////////////////////////////////////////////////////////////// 1009 // gather4 intrinsics 1010 ////////////////////////////////////////////////////////////////////////// 1011 foreach sample = AMDGPUSampleVariantsNoGradients in { 1012 foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in { 1013 def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name: 1014 AMDGPUImageDimIntrinsic< 1015 AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>, 1016 [IntrReadMem], [SDNPMemOperand]>; 1017 } 1018 } 1019} 1020 1021////////////////////////////////////////////////////////////////////////// 1022// atomic intrinsics 1023////////////////////////////////////////////////////////////////////////// 1024defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = { 1025 multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs, 1026 int isFloat = 0> { 1027 foreach dim = AMDGPUDims.All in { 1028 def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic< 1029 !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>, 1030 AMDGPUDimAtomicProfile<opmod, dim, dataargs>), 1031 [], [SDNPMemOperand]>; 1032 } 1033 } 1034 1035 multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> { 1036 defm "" 1037 : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">], 1038 isFloat>; 1039 } 1040 1041 multiclass AMDGPUImageDimFloatAtomic<string opmod> { 1042 defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>; 1043 } 1044 1045 defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">; 1046 defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">; 1047 defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">; 1048 defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">; 1049 defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">; 1050 defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">; 1051 defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">; 1052 defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">; 1053 defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">; 1054 defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">; 1055 defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">; 1056 defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">; 1057 defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">; 1058 defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">; 1059 defm int_amdgcn_image_atomic_add_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_ADD_FLT">; 1060 defm int_amdgcn_image_atomic_min_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MIN_FLT">; 1061 defm int_amdgcn_image_atomic_max_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MAX_FLT">; 1062 1063 defm int_amdgcn_image_atomic_cmpswap : 1064 AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">, 1065 AMDGPUArg<LLVMMatchType<0>, "cmp">]>; 1066 1067 defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">; 1068 defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">; 1069} 1070 1071////////////////////////////////////////////////////////////////////////// 1072// Buffer intrinsics 1073////////////////////////////////////////////////////////////////////////// 1074 1075// Data type for buffer resources (V#). Maybe, in the future, we can create a 1076// similar one for textures (T#). 1077def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>; 1078 1079let TargetPrefix = "amdgcn" in { 1080 1081def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic < 1082 [AMDGPUBufferRsrcTy], 1083 [llvm_anyptr_ty, // base 1084 llvm_i16_ty, // stride (and swizzle control) 1085 llvm_i32_ty, // NumRecords / extent 1086 llvm_i32_ty], // flags 1087 // Attributes lifted from ptrmask + some extra argument attributes. 1088 [IntrNoMem, ReadNone<ArgIndex<0>>, 1089 IntrSpeculatable, IntrWillReturn]>; 1090 1091defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = { 1092 1093// Generate a buffer_load instruction that may be optimized to s_buffer_load if 1094// the offset argument is uniform. 1095def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < 1096 [llvm_any_ty], 1097 [llvm_v4i32_ty, // rsrc(SGPR) 1098 llvm_i32_ty, // byte offset 1099 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1100 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1101 // bit 3 = swz, bit 4 = scc (gfx90a) 1102 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1103 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1104 // bit 6 = swz 1105 // Note: volatile bit is **not** permitted here. 1106 [IntrNoMem, ImmArg<ArgIndex<2>>]>, 1107 AMDGPURsrcIntrinsic<0>; 1108 1109// Buffer intrinsics with separate raw and struct variants. The raw 1110// variant never has an index. The struct variant always has an index, even if 1111// it is const 0. A struct intrinsic with constant 0 index is different to the 1112// corresponding raw intrinsic on gfx9+ because the behavior of bound checking 1113// and swizzling changes depending on whether idxen is set in the instruction. 1114// These instrinsics also keep the offset and soffset arguments separate as 1115// they behave differently in bounds checking and swizzling. 1116 1117// The versions of these intrinsics that take <4 x i32> arguments are deprecated 1118// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments, 1119// which allow for improved reasoning about memory accesses. 1120// 1121// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved 1122// through to final assembly selection and is used to signal that the buffer 1123// operation is volatile. 1124class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1125 [data_ty], 1126 [llvm_v4i32_ty, // rsrc(SGPR) 1127 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1128 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1129 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1130 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1131 // bit 3 = swz, bit 4 = scc (gfx90a) 1132 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1133 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1134 // bit 6 = swz 1135 // all: volatile op (bit 31, stripped at lowering) 1136 [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>, 1137 AMDGPURsrcIntrinsic<0>; 1138def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>; 1139def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad; 1140 1141class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1142 [data_ty], 1143 [llvm_v4i32_ty, // rsrc(SGPR) 1144 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1145 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1146 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1147 // bit 1 = slc, 1148 // bit 2 = dlc on gfx10+), 1149 // swizzled buffer (bit 3 = swz)) 1150 [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1151 AMDGPURsrcIntrinsic<0>; 1152def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad; 1153 1154class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1155 [data_ty], 1156 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1157 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1158 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1159 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1160 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1161 // bit 3 = swz, bit 4 = scc (gfx90a) 1162 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1163 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1164 // bit 6 = swz 1165 // all: volatile op (bit 31, stripped at lowering) 1166 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1167 ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>, 1168 AMDGPURsrcIntrinsic<0>; 1169def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>; 1170def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad; 1171 1172class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1173 [data_ty], 1174 [AMDGPUBufferRsrcTy,// rsrc(SGPR) 1175 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1176 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1177 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1178 // bit 1 = slc, 1179 // bit 2 = dlc on gfx10+), 1180 // swizzled buffer (bit 3 = swz)) 1181 [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1182 AMDGPURsrcIntrinsic<0>; 1183def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad; 1184 1185class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1186 [data_ty], 1187 [llvm_v4i32_ty, // rsrc(SGPR) 1188 llvm_i32_ty, // vindex(VGPR) 1189 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1190 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1191 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1192 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1193 // bit 3 = swz, bit 4 = scc (gfx90a) 1194 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1195 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1196 // bit 6 = swz 1197 // all: volatile op (bit 31, stripped at lowering) 1198 [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1199 AMDGPURsrcIntrinsic<0>; 1200def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; 1201def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad; 1202 1203class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1204 [data_ty], 1205 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1206 llvm_i32_ty, // vindex(VGPR) 1207 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1208 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1209 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1210 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1211 // bit 3 = swz, bit 4 = scc (gfx90a) 1212 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1213 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1214 // bit 6 = swz 1215 // all: volatile op (bit 31, stripped at lowering) 1216 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1217 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1218 AMDGPURsrcIntrinsic<0>; 1219def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad; 1220def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad; 1221 1222class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1223 [], 1224 [data_ty, // vdata(VGPR) 1225 llvm_v4i32_ty, // rsrc(SGPR) 1226 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1227 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1228 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1229 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1230 // bit 3 = swz, bit 4 = scc (gfx90a) 1231 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1232 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1233 // bit 6 = swz 1234 // all: volatile op (bit 31, stripped at lowering) 1235 [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1236 AMDGPURsrcIntrinsic<1>; 1237def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>; 1238def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore; 1239 1240class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1241 [], 1242 [data_ty, // vdata(VGPR) 1243 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1244 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1245 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1246 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1247 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1248 // bit 3 = swz, bit 4 = scc (gfx90a) 1249 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1250 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1251 // bit 6 = swz 1252 // all: volatile op (bit 31, stripped at lowering) 1253 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1254 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1255 AMDGPURsrcIntrinsic<1>; 1256def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>; 1257def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore; 1258 1259class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1260 [], 1261 [data_ty, // vdata(VGPR) 1262 llvm_v4i32_ty, // rsrc(SGPR) 1263 llvm_i32_ty, // vindex(VGPR) 1264 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1265 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1266 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1267 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1268 // bit 3 = swz, bit 4 = scc (gfx90a) 1269 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1270 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1271 // bit 6 = swz 1272 // all: volatile op (bit 31, stripped at lowering) 1273 [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1274 AMDGPURsrcIntrinsic<1>; 1275def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; 1276def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore; 1277 1278class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1279 [], 1280 [data_ty, // vdata(VGPR) 1281 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1282 llvm_i32_ty, // vindex(VGPR) 1283 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1284 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1285 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1286 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1287 // bit 3 = swz, bit 4 = scc (gfx90a) 1288 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1289 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1290 // bit 6 = swz 1291 // all: volatile op (bit 31, stripped at lowering) 1292 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1293 ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1294 AMDGPURsrcIntrinsic<1>; 1295def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore; 1296def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore; 1297 1298class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1299 [data_ty], 1300 [LLVMMatchType<0>, // vdata(VGPR) 1301 llvm_v4i32_ty, // rsrc(SGPR) 1302 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1303 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1304 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1305 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1306 AMDGPURsrcIntrinsic<1, 0>; 1307def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic; 1308def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic; 1309def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic; 1310def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic; 1311def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic; 1312def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1313def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic; 1314def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic; 1315def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1316def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic; 1317def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; 1318def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; 1319def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; 1320def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; 1321def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; 1322def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< 1323 [llvm_anyint_ty], 1324 [LLVMMatchType<0>, // src(VGPR) 1325 LLVMMatchType<0>, // cmp(VGPR) 1326 llvm_v4i32_ty, // rsrc(SGPR) 1327 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1328 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1329 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1330 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1331 AMDGPURsrcIntrinsic<2, 0>; 1332 1333class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1334 [data_ty], 1335 [LLVMMatchType<0>, // vdata(VGPR) 1336 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1337 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1338 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1339 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1340 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1341 ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1342 AMDGPURsrcIntrinsic<1, 0>; 1343 1344def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic; 1345def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic; 1346def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic; 1347def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic; 1348def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic; 1349def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1350def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic; 1351def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic; 1352def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1353def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic; 1354def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; 1355def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; 1356def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; 1357def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; 1358def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; 1359def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< 1360 [llvm_anyint_ty], 1361 [LLVMMatchType<0>, // src(VGPR) 1362 LLVMMatchType<0>, // cmp(VGPR) 1363 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1364 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1365 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1366 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1367 [IntrArgMemOnly, NoCapture<ArgIndex<2>>, 1368 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1369 AMDGPURsrcIntrinsic<2, 0>; 1370 1371// gfx908 intrinsic 1372def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1373 1374// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx12+. 1375def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1376 1377class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1378 [data_ty], 1379 [LLVMMatchType<0>, // vdata(VGPR) 1380 llvm_v4i32_ty, // rsrc(SGPR) 1381 llvm_i32_ty, // vindex(VGPR) 1382 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1383 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1384 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1385 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1386 AMDGPURsrcIntrinsic<1, 0>; 1387def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; 1388def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic; 1389def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic; 1390def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic; 1391def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic; 1392def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic; 1393def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic; 1394def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic; 1395def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; 1396def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; 1397def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; 1398def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; 1399def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; 1400def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< 1401 [llvm_anyint_ty], 1402 [LLVMMatchType<0>, // src(VGPR) 1403 LLVMMatchType<0>, // cmp(VGPR) 1404 llvm_v4i32_ty, // rsrc(SGPR) 1405 llvm_i32_ty, // vindex(VGPR) 1406 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1407 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1408 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1409 [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1410 AMDGPURsrcIntrinsic<2, 0>; 1411 1412class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1413 [data_ty], 1414 [LLVMMatchType<0>, // vdata(VGPR) 1415 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1416 llvm_i32_ty, // vindex(VGPR) 1417 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1418 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1419 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1420 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1421 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1422 AMDGPURsrcIntrinsic<1, 0>; 1423def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic; 1424def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic; 1425def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic; 1426def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic; 1427def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic; 1428def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic; 1429def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic; 1430def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic; 1431def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; 1432def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; 1433def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; 1434def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; 1435def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; 1436def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< 1437 [llvm_anyint_ty], 1438 [LLVMMatchType<0>, // src(VGPR) 1439 LLVMMatchType<0>, // cmp(VGPR) 1440 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1441 llvm_i32_ty, // vindex(VGPR) 1442 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1443 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1444 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1445 [IntrArgMemOnly, NoCapture<ArgIndex<2>>, 1446 ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1447 AMDGPURsrcIntrinsic<2, 0>; 1448 1449// gfx908 intrinsic 1450def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1451def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1452 1453// gfx90a intrinsics 1454def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1455def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1456 1457def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1458def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1459 1460// tbuffer intrinsics, with: 1461// - raw and struct variants 1462// - joint format field 1463// - joint cachepolicy field 1464def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < 1465 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1466 [llvm_v4i32_ty, // rsrc(SGPR) 1467 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1468 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1469 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1470 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1471 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1472 // bit 3 = swz, bit 4 = scc (gfx90a) 1473 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1474 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1475 // bit 6 = swz 1476 [IntrReadMem, 1477 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1478 AMDGPURsrcIntrinsic<0>; 1479 1480def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1481 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1482 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1483 llvm_i32_ty, // offset(VGPR/imm, included in bounds` checking and swizzling) 1484 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1485 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1486 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1487 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1488 // bit 3 = swz, bit 4 = scc (gfx90a) 1489 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1490 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1491 // bit 6 = swz 1492 // all: volatile op (bit 31, stripped at lowering) 1493 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1494 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1495 AMDGPURsrcIntrinsic<0>; 1496 1497def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < 1498 [], 1499 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1500 llvm_v4i32_ty, // rsrc(SGPR) 1501 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1502 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1503 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1504 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1505 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1506 // bit 3 = swz, bit 4 = scc (gfx90a) 1507 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1508 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1509 // bit 6 = swz 1510 // all: volatile op (bit 31, stripped at lowering) 1511 [IntrWriteMem, 1512 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1513 AMDGPURsrcIntrinsic<1>; 1514 1515def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1516 [], 1517 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1518 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1519 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1520 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1521 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1522 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1523 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1524 // bit 3 = swz, bit 4 = scc (gfx90a) 1525 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1526 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1527 // bit 6 = swz 1528 // all: volatile op (bit 31, stripped at lowering) 1529 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1530 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1531 AMDGPURsrcIntrinsic<1>; 1532 1533def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < 1534 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1535 [llvm_v4i32_ty, // rsrc(SGPR) 1536 llvm_i32_ty, // vindex(VGPR) 1537 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1538 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1539 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1540 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1541 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1542 // bit 3 = swz, bit 4 = scc (gfx90a) 1543 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1544 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1545 // bit 6 = swz 1546 // all: volatile op (bit 31, stripped at lowering) 1547 [IntrReadMem, 1548 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1549 AMDGPURsrcIntrinsic<0>; 1550 1551def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1552 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1553 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1554 llvm_i32_ty, // vindex(VGPR) 1555 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1556 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1557 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1558 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1559 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1560 // bit 3 = swz, bit 4 = scc (gfx90a) 1561 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1562 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1563 // bit 6 = swz 1564 // all: volatile op (bit 31, stripped at lowering) 1565 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1566 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1567 AMDGPURsrcIntrinsic<0>; 1568 1569def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1570 [], 1571 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1572 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1573 llvm_i32_ty, // vindex(VGPR) 1574 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1575 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1576 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1577 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1578 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1579 // bit 3 = swz, bit 4 = scc (gfx90a) 1580 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1581 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1582 // bit 6 = swz 1583 // all: volatile op (bit 31, stripped at lowering) 1584 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1585 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1586 AMDGPURsrcIntrinsic<1>; 1587 1588def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < 1589 [], 1590 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1591 llvm_v4i32_ty, // rsrc(SGPR) 1592 llvm_i32_ty, // vindex(VGPR) 1593 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1594 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1595 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1596 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1597 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1598 // bit 3 = swz, bit 4 = scc (gfx90a) 1599 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1600 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1601 // bit 6 = swz 1602 // all: volatile op (bit 31, stripped at lowering) 1603 [IntrWriteMem, 1604 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1605 AMDGPURsrcIntrinsic<1>; 1606 1607class AMDGPURawBufferLoadLDS : Intrinsic < 1608 [], 1609 [llvm_v4i32_ty, // rsrc(SGPR) 1610 LLVMQualPointerType<3>, // LDS base offset 1611 llvm_i32_ty, // Data byte size: 1/2/4 1612 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1613 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1614 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1615 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1616 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1617 // bit 3 = swz, bit 4 = scc (gfx90a) 1618 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1619 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1620 // bit 6 = swz 1621 // all: volatile op (bit 31, stripped at lowering) 1622 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1623 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1624def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; 1625 1626class AMDGPURawPtrBufferLoadLDS : Intrinsic < 1627 [], 1628 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1629 LLVMQualPointerType<3>, // LDS base offset 1630 llvm_i32_ty, // Data byte size: 1/2/4 1631 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1632 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1633 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1634 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1635 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1636 // bit 3 = swz, bit 4 = scc (gfx90a) 1637 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1638 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1639 // bit 6 = swz 1640 // all: volatile op (bit 31, stripped at lowering) 1641 [IntrWillReturn, IntrArgMemOnly, 1642 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1643 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1644 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1645 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1646def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS; 1647 1648class AMDGPUStructBufferLoadLDS : Intrinsic < 1649 [], 1650 [llvm_v4i32_ty, // rsrc(SGPR) 1651 LLVMQualPointerType<3>, // LDS base offset 1652 llvm_i32_ty, // Data byte size: 1/2/4 1653 llvm_i32_ty, // vindex(VGPR) 1654 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1655 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1656 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1657 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1658 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1659 // bit 3 = swz, bit 4 = scc (gfx90a) 1660 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1661 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1662 // bit 6 = swz 1663 // all: volatile op (bit 31, stripped at lowering) 1664 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1665 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1666def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; 1667 1668class AMDGPUStructPtrBufferLoadLDS : Intrinsic < 1669 [], 1670 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1671 LLVMQualPointerType<3>, // LDS base offset 1672 llvm_i32_ty, // Data byte size: 1/2/4 1673 llvm_i32_ty, // vindex(VGPR) 1674 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1675 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1676 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1677 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1678 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1679 // bit 3 = swz, bit 4 = scc (gfx90a) 1680 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1681 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1682 // bit 6 = swz 1683 // all: volatile op (bit 31, stripped at lowering) 1684 [IntrWillReturn, IntrArgMemOnly, 1685 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1686 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1687 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1688 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1689def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS; 1690 1691} // defset AMDGPUBufferIntrinsics 1692 1693// Uses that do not set the done bit should set IntrWriteMem on the 1694// call site. 1695def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ 1696 llvm_i32_ty, // tgt, 1697 llvm_i32_ty, // en 1698 llvm_any_ty, // src0 (f32 or i32) 1699 LLVMMatchType<0>, // src1 1700 LLVMMatchType<0>, // src2 1701 LLVMMatchType<0>, // src3 1702 llvm_i1_ty, // done 1703 llvm_i1_ty // vm (ignored on GFX11+) 1704 ], 1705 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1706 ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly] 1707>; 1708 1709// exp with row_en bit set. Only supported on GFX11+. 1710def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [ 1711 llvm_i32_ty, // tgt, 1712 llvm_i32_ty, // en 1713 llvm_any_ty, // src0 (f32 or i32) 1714 LLVMMatchType<0>, // src1 1715 LLVMMatchType<0>, // src2 1716 LLVMMatchType<0>, // src3 1717 llvm_i1_ty, // done 1718 llvm_i32_ty], // row number 1719 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1720 IntrWriteMem, IntrInaccessibleMemOnly] 1721>; 1722 1723// exp with compr bit set. Not supported on GFX11+. 1724def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [ 1725 llvm_i32_ty, // tgt, 1726 llvm_i32_ty, // en 1727 llvm_anyvector_ty, // src0 (v2f16 or v2i16) 1728 LLVMMatchType<0>, // src1 1729 llvm_i1_ty, // done 1730 llvm_i1_ty], // vm 1731 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>, 1732 ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly] 1733>; 1734 1735def int_amdgcn_buffer_wbinvl1_sc : 1736 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, 1737 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1738 1739def int_amdgcn_buffer_wbinvl1 : 1740 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, 1741 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1742 1743def int_amdgcn_s_dcache_inv : 1744 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">, 1745 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1746 1747def int_amdgcn_s_memtime : 1748 ClangBuiltin<"__builtin_amdgcn_s_memtime">, 1749 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>; 1750 1751def int_amdgcn_s_sleep : 1752 ClangBuiltin<"__builtin_amdgcn_s_sleep">, 1753 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1754 IntrHasSideEffects]> { 1755} 1756 1757def int_amdgcn_s_sleep_var 1758 : ClangBuiltin<"__builtin_amdgcn_s_sleep_var">, 1759 Intrinsic<[], [llvm_i32_ty], 1760 [IntrNoMem, IntrHasSideEffects, IntrWillReturn]> { 1761} 1762 1763def int_amdgcn_s_nop : 1764 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1765 IntrHasSideEffects]> { 1766} 1767 1768def int_amdgcn_s_incperflevel : 1769 ClangBuiltin<"__builtin_amdgcn_s_incperflevel">, 1770 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1771 IntrHasSideEffects]> { 1772} 1773 1774def int_amdgcn_s_decperflevel : 1775 ClangBuiltin<"__builtin_amdgcn_s_decperflevel">, 1776 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1777 IntrHasSideEffects]> { 1778} 1779 1780def int_amdgcn_s_sethalt : 1781 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1782 IntrHasSideEffects]>; 1783 1784def int_amdgcn_s_setprio : 1785 ClangBuiltin<"__builtin_amdgcn_s_setprio">, 1786 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1787 IntrHasSideEffects]>; 1788 1789def int_amdgcn_s_ttracedata : 1790 ClangBuiltin<"__builtin_amdgcn_s_ttracedata">, 1791 DefaultAttrsIntrinsic<[], [llvm_i32_ty], 1792 [IntrNoMem, IntrHasSideEffects]>; 1793 1794def int_amdgcn_s_ttracedata_imm : 1795 ClangBuiltin<"__builtin_amdgcn_s_ttracedata_imm">, 1796 DefaultAttrsIntrinsic<[], [llvm_i16_ty], 1797 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>; 1798 1799// This is IntrHasSideEffects so it can be used to read cycle counters. 1800def int_amdgcn_s_getreg : 1801 ClangBuiltin<"__builtin_amdgcn_s_getreg">, 1802 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], 1803 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 1804>; 1805 1806// Note this can be used to set FP environment properties that are 1807// unsafe to change in non-strictfp functions. The register properties 1808// available (and value required to access them) may differ per 1809// subtarget. llvm.amdgcn.s.setreg(hwmode, value) 1810def int_amdgcn_s_setreg : 1811 ClangBuiltin<"__builtin_amdgcn_s_setreg">, 1812 DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty], 1813 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 1814>; 1815 1816// int_amdgcn_s_getpc is provided to allow a specific style of position 1817// independent code to determine the high part of its address when it is 1818// known (through convention) that the code and any data of interest does 1819// not cross a 4Gb address boundary. Use for any other purpose may not 1820// produce the desired results as optimizations may cause code movement, 1821// especially as we explicitly use IntrNoMem to allow optimizations. 1822// This intrinsic always returns PC sign-extended from 48 bits even if the 1823// s_getpc_b64 instruction returns a zero-extended value. 1824def int_amdgcn_s_getpc : 1825 ClangBuiltin<"__builtin_amdgcn_s_getpc">, 1826 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [NoUndef<RetIndex>, IntrNoMem, 1827 IntrSpeculatable, IntrWillReturn]>; 1828 1829// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0> 1830// param values: 0 = P10, 1 = P20, 2 = P0 1831def int_amdgcn_interp_mov : 1832 ClangBuiltin<"__builtin_amdgcn_interp_mov">, 1833 DefaultAttrsIntrinsic<[llvm_float_ty], 1834 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1835 [IntrNoMem, IntrSpeculatable, 1836 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 1837 1838// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0> 1839// This intrinsic reads from lds, but the memory values are constant, 1840// so it behaves like IntrNoMem. 1841def int_amdgcn_interp_p1 : 1842 ClangBuiltin<"__builtin_amdgcn_interp_p1">, 1843 DefaultAttrsIntrinsic<[llvm_float_ty], 1844 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1845 [IntrNoMem, IntrSpeculatable, 1846 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 1847 1848// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0> 1849def int_amdgcn_interp_p2 : 1850 ClangBuiltin<"__builtin_amdgcn_interp_p2">, 1851 DefaultAttrsIntrinsic<[llvm_float_ty], 1852 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1853 [IntrNoMem, IntrSpeculatable, 1854 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 1855 // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. 1856 1857// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0> 1858// high selects whether high or low 16-bits are loaded from LDS 1859def int_amdgcn_interp_p1_f16 : 1860 ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">, 1861 DefaultAttrsIntrinsic<[llvm_float_ty], 1862 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 1863 [IntrNoMem, IntrSpeculatable, 1864 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 1865 1866// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0> 1867// high selects whether high or low 16-bits are loaded from LDS 1868def int_amdgcn_interp_p2_f16 : 1869 ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">, 1870 DefaultAttrsIntrinsic<[llvm_half_ty], 1871 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 1872 [IntrNoMem, IntrSpeculatable, 1873 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>; 1874 1875// llvm.amdgcn.lds.direct.load <m0> 1876// The input argument is m0, which contains a packed combination of address 1877// offset and flags describing the data type. 1878def int_amdgcn_lds_direct_load : 1879 DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 1880 [llvm_i32_ty], 1881 [IntrReadMem, IntrSpeculatable]>; 1882 1883// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0> 1884// Like interp intrinsics, this reads from lds, but the memory values are constant, 1885// so it behaves like IntrNoMem. 1886def int_amdgcn_lds_param_load : 1887 DefaultAttrsIntrinsic<[llvm_float_ty], 1888 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1889 [IntrNoMem, IntrSpeculatable, 1890 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; 1891 1892// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0> 1893def int_amdgcn_interp_inreg_p10 : 1894 DefaultAttrsIntrinsic<[llvm_float_ty], 1895 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 1896 [IntrNoMem, IntrSpeculatable]>; 1897 1898// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp> 1899def int_amdgcn_interp_inreg_p2 : 1900 DefaultAttrsIntrinsic<[llvm_float_ty], 1901 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 1902 [IntrNoMem, IntrSpeculatable]>; 1903 1904// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high> 1905// high selects whether high or low 16-bits are used for p and p0 operands 1906def int_amdgcn_interp_inreg_p10_f16: 1907 DefaultAttrsIntrinsic<[llvm_float_ty], 1908 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 1909 [IntrNoMem, IntrSpeculatable, 1910 ImmArg<ArgIndex<3>>]>; 1911 1912// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high> 1913// high selects whether high or low 16-bits are used for p operand 1914def int_amdgcn_interp_inreg_p2_f16 : 1915 DefaultAttrsIntrinsic<[llvm_half_ty], 1916 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 1917 [IntrNoMem, IntrSpeculatable, 1918 ImmArg<ArgIndex<3>>]>; 1919 1920// llvm.amdgcn.interp.p10.rtz.f16 <p>, <i>, <p0>, <high> 1921// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode. 1922// high selects whether high or low 16-bits are used for p and p0 operands 1923def int_amdgcn_interp_p10_rtz_f16: 1924 DefaultAttrsIntrinsic<[llvm_float_ty], 1925 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 1926 [IntrNoMem, IntrSpeculatable, 1927 ImmArg<ArgIndex<3>>]>; 1928 1929// llvm.amdgcn.interp.p2.rtz.f16 <p>, <j>, <tmp>, <high> 1930// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode. 1931// high selects whether high or low 16-bits are used for p operand 1932def int_amdgcn_interp_p2_rtz_f16 : 1933 DefaultAttrsIntrinsic<[llvm_half_ty], 1934 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 1935 [IntrNoMem, IntrSpeculatable, 1936 ImmArg<ArgIndex<3>>]>; 1937 1938// Deprecated: use llvm.amdgcn.live.mask instead. 1939def int_amdgcn_ps_live : DefaultAttrsIntrinsic < 1940 [llvm_i1_ty], 1941 [], 1942 [IntrNoMem]>; 1943 1944// Query currently live lanes. 1945// Returns true if lane is live (and not a helper lane). 1946def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty], 1947 [], [NoUndef<RetIndex>, IntrReadMem, IntrInaccessibleMemOnly] 1948>; 1949 1950def int_amdgcn_mbcnt_lo : 1951 ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, 1952 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 1953 [IntrNoMem]>; 1954 1955def int_amdgcn_mbcnt_hi : 1956 ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, 1957 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 1958 [IntrNoMem]>; 1959 1960// llvm.amdgcn.ds.swizzle src offset 1961def int_amdgcn_ds_swizzle : 1962 ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, 1963 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 1964 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, 1965 ImmArg<ArgIndex<1>>]>; 1966 1967def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 1968 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 1969 [IntrNoMem, IntrSpeculatable] 1970>; 1971 1972def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 1973 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 1974 [IntrNoMem, IntrSpeculatable] 1975>; 1976 1977def int_amdgcn_lerp : 1978 ClangBuiltin<"__builtin_amdgcn_lerp">, 1979 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1980 [IntrNoMem, IntrSpeculatable] 1981>; 1982 1983def int_amdgcn_sad_u8 : 1984 ClangBuiltin<"__builtin_amdgcn_sad_u8">, 1985 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1986 [IntrNoMem, IntrSpeculatable] 1987>; 1988 1989def int_amdgcn_msad_u8 : 1990 ClangBuiltin<"__builtin_amdgcn_msad_u8">, 1991 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1992 [IntrNoMem, IntrSpeculatable] 1993>; 1994 1995def int_amdgcn_sad_hi_u8 : 1996 ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">, 1997 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1998 [IntrNoMem, IntrSpeculatable] 1999>; 2000 2001def int_amdgcn_sad_u16 : 2002 ClangBuiltin<"__builtin_amdgcn_sad_u16">, 2003 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2004 [IntrNoMem, IntrSpeculatable] 2005>; 2006 2007def int_amdgcn_qsad_pk_u16_u8 : 2008 ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, 2009 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 2010 [IntrNoMem, IntrSpeculatable] 2011>; 2012 2013def int_amdgcn_mqsad_pk_u16_u8 : 2014 ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, 2015 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 2016 [IntrNoMem, IntrSpeculatable] 2017>; 2018 2019def int_amdgcn_mqsad_u32_u8 : 2020 ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, 2021 DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], 2022 [IntrNoMem, IntrSpeculatable] 2023>; 2024 2025def int_amdgcn_cvt_pk_u8_f32 : 2026 ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, 2027 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], 2028 [IntrNoMem, IntrSpeculatable] 2029>; 2030 2031def int_amdgcn_icmp : 2032 Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty], 2033 [IntrNoMem, IntrConvergent, 2034 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2035 2036def int_amdgcn_fcmp : 2037 Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty], 2038 [IntrNoMem, IntrConvergent, 2039 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2040 2041def int_amdgcn_ballot : 2042 Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], 2043 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2044 2045def int_amdgcn_inverse_ballot : 2046 Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], 2047 [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2048 2049// Lowers to S_BITREPLICATE_B64_B32. 2050// The argument must be uniform; otherwise, the result is undefined. 2051def int_amdgcn_s_bitreplicate : 2052 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>; 2053 2054// Lowers to S_QUADMASK_B{32,64} 2055// The argument must be uniform; otherwise, the result is undefined. 2056def int_amdgcn_s_quadmask : 2057 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 2058 2059// Lowers to S_WQM_B{32,64} 2060// The argument must be uniform; otherwise, the result is undefined. 2061// Does not set WQM; merely calculates the bitmask. 2062def int_amdgcn_s_wqm : 2063 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 2064 2065class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic< 2066 [data_ty], 2067 [ 2068 LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR) 2069 llvm_i32_ty // Reduction Strategy Switch for lowering ( 0: Default, 2070 // 1: Iterative strategy, and 2071 // 2. DPP) 2072 ], 2073 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>; 2074 2075def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; 2076def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; 2077 2078def int_amdgcn_readfirstlane : 2079 Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], 2080 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2081 2082// The lane argument must be uniform across the currently active threads of the 2083// current wave. Otherwise, the result is undefined. 2084def int_amdgcn_readlane : 2085 Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], 2086 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2087 2088// The value to write and lane select arguments must be uniform across the 2089// currently active threads of the current wave. Otherwise, the result is 2090// undefined. 2091def int_amdgcn_writelane : 2092 Intrinsic<[llvm_any_ty], [ 2093 LLVMMatchType<0>, // uniform value to write: returned by the selected lane 2094 llvm_i32_ty, // uniform lane select 2095 LLVMMatchType<0> // returned by all lanes other than the selected one 2096 ], 2097 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2098>; 2099 2100def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, 2101 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2102 [IntrNoMem, IntrSpeculatable] 2103>; 2104 2105// mul24 intrinsics can return i32 or i64. 2106// When returning i64, they're lowered to a mul24/mulhi24 pair. 2107def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2108 [llvm_i32_ty, llvm_i32_ty], 2109 [IntrNoMem, IntrSpeculatable] 2110>; 2111 2112def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2113 [llvm_i32_ty, llvm_i32_ty], 2114 [IntrNoMem, IntrSpeculatable] 2115>; 2116 2117def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2118 [llvm_i32_ty, llvm_i32_ty], 2119 [IntrNoMem, IntrSpeculatable] 2120>; 2121 2122def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2123 [llvm_i32_ty, llvm_i32_ty], 2124 [IntrNoMem, IntrSpeculatable] 2125>; 2126 2127// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) 2128// 2129// bar_val is the total number of waves that will wait on this 2130// barrier, minus 1. 2131def int_amdgcn_ds_gws_init : 2132 ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, 2133 Intrinsic<[], 2134 [llvm_i32_ty, llvm_i32_ty], 2135 [IntrConvergent, IntrWriteMem, 2136 IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2137 [SDNPMemOperand] 2138>; 2139 2140// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) 2141// bar_val is the total number of waves that will wait on this 2142// barrier, minus 1. 2143def int_amdgcn_ds_gws_barrier : 2144 ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, 2145 Intrinsic<[], 2146 [llvm_i32_ty, llvm_i32_ty], 2147 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2148 [SDNPMemOperand] 2149>; 2150 2151// llvm.amdgcn.ds.gws.sema.v(i32 resource_id) 2152def int_amdgcn_ds_gws_sema_v : 2153 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, 2154 Intrinsic<[], 2155 [llvm_i32_ty], 2156 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2157 [SDNPMemOperand] 2158>; 2159 2160// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) 2161def int_amdgcn_ds_gws_sema_br : 2162 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, 2163 Intrinsic<[], 2164 [llvm_i32_ty, llvm_i32_ty], 2165 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2166 [SDNPMemOperand] 2167>; 2168 2169// llvm.amdgcn.ds.gws.sema.p(i32 resource_id) 2170def int_amdgcn_ds_gws_sema_p : 2171 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, 2172 Intrinsic<[], 2173 [llvm_i32_ty], 2174 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2175 [SDNPMemOperand] 2176>; 2177 2178// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) 2179def int_amdgcn_ds_gws_sema_release_all : 2180 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, 2181 Intrinsic<[], 2182 [llvm_i32_ty], 2183 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2184 [SDNPMemOperand] 2185>; 2186 2187 2188// Copies the source value to the destination value, with the guarantee that 2189// the source value is computed as if the entire program were executed in WQM. 2190def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], 2191 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2192>; 2193 2194// Copies the source value to the destination value, such that the source 2195// is computed as if the entire program were executed in WQM if any other 2196// program code executes in WQM. 2197def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], 2198 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2199>; 2200 2201// Return true if at least one thread within the pixel quad passes true into 2202// the function. 2203def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], 2204 [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2205>; 2206 2207// If false, set EXEC=0 for the current thread until the end of program. 2208// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? 2209def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>; 2210 2211def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, 2212 Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrConvergent, 2213 IntrNoCallback, IntrNoFree] 2214>; 2215 2216// If false, mark all active lanes as helper lanes until the end of program. 2217def int_amdgcn_wqm_demote : Intrinsic<[], 2218 [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree] 2219>; 2220 2221// Copies the active channels of the source value to the destination value, 2222// with the guarantee that the source value is computed as if the entire 2223// program were executed in Whole Wavefront Mode, i.e. with all channels 2224// enabled, with a few exceptions: - Phi nodes which require WWM return an 2225// undefined value. 2226def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], 2227 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2228 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2229>; 2230// Deprecated. Use int_amdgcn_strict_wwm instead. 2231def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], 2232 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2233 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2234>; 2235def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], 2236 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2237 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2238>; 2239 2240// Given a value, copies it while setting all the inactive lanes to a given 2241// value. Note that OpenGL helper lanes are considered active, so if the 2242// program ever uses WQM, then the instruction and the first source will be 2243// computed in WQM. 2244def int_amdgcn_set_inactive : 2245 Intrinsic<[llvm_any_ty], 2246 [LLVMMatchType<0>, // value to be copied 2247 LLVMMatchType<0>], // value for the inactive lanes to take 2248 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2249 2250// Similar to int_amdgcn_set_inactive, but the value for the inactive lanes must 2251// be a VGPR function argument. 2252// Can only be used in functions with the `amdgpu_cs_chain` or 2253// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2254// flow. 2255def int_amdgcn_set_inactive_chain_arg : 2256 Intrinsic<[llvm_anyint_ty], 2257 [LLVMMatchType<0>, // value to be copied 2258 LLVMMatchType<0>], // value for the inactive lanes to take 2259 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2260 2261// Return if the given flat pointer points to a local memory address. 2262def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, 2263 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2264 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2265>; 2266 2267// Return if the given flat pointer points to a prvate memory address. 2268def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, 2269 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2270 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2271>; 2272 2273// A uniform tail call to a function with the `amdgpu_cs_chain` or 2274// `amdgpu_cs_chain_preserve` calling convention. It will populate the SGPRs 2275// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to 2276// the given function. 2277// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or 2278// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2279// flow. 2280def int_amdgcn_cs_chain: 2281 Intrinsic<[], 2282 [llvm_anyptr_ty, // The function to jump to. 2283 llvm_anyint_ty, // Value to put in EXEC (should be i32 or i64). 2284 llvm_any_ty, // Arguments that will be copied into SGPRs (s0+). 2285 // Must be uniform. 2286 llvm_any_ty, // Arguments that will be copied into VGPRs (v8+). 2287 // Need not be uniform. 2288 llvm_i32_ty, // Flags. 2289 llvm_vararg_ty // Additional arguments. Only present if Flags is 2290 // non-zero. 2291 ], 2292 [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>; 2293 2294 2295//===----------------------------------------------------------------------===// 2296// CI+ Intrinsics 2297//===----------------------------------------------------------------------===// 2298 2299def int_amdgcn_s_dcache_inv_vol : 2300 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, 2301 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2302 2303def int_amdgcn_buffer_wbinvl1_vol : 2304 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, 2305 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2306 2307//===----------------------------------------------------------------------===// 2308// VI Intrinsics 2309//===----------------------------------------------------------------------===// 2310 2311// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2312def int_amdgcn_mov_dpp : 2313 Intrinsic<[llvm_anyint_ty], 2314 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 2315 llvm_i1_ty], 2316 [IntrNoMem, IntrConvergent, IntrWillReturn, 2317 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, 2318 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2319 2320// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2321// Should be equivalent to: 2322// v_mov_b32 <dest> <old> 2323// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2324def int_amdgcn_update_dpp : 2325 Intrinsic<[llvm_any_ty], 2326 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, 2327 llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], 2328 [IntrNoMem, IntrConvergent, IntrWillReturn, 2329 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, 2330 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2331 2332def int_amdgcn_s_dcache_wb : 2333 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, 2334 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2335 2336def int_amdgcn_s_dcache_wb_vol : 2337 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, 2338 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2339 2340def int_amdgcn_s_memrealtime : 2341 ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, 2342 Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2343 2344// llvm.amdgcn.ds.permute <index> <src> 2345def int_amdgcn_ds_permute : 2346 ClangBuiltin<"__builtin_amdgcn_ds_permute">, 2347 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2348 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2349 2350// llvm.amdgcn.ds.bpermute <index> <src> 2351def int_amdgcn_ds_bpermute : 2352 ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, 2353 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2354 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2355 2356// llvm.amdgcn.perm <src0> <src1> <selector> 2357def int_amdgcn_perm : 2358 ClangBuiltin<"__builtin_amdgcn_perm">, 2359 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2360 [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2361 2362//===----------------------------------------------------------------------===// 2363// GFX9 Intrinsics 2364//===----------------------------------------------------------------------===// 2365 2366class AMDGPUGlobalLoadLDS : 2367 ClangBuiltin<"__builtin_amdgcn_global_load_lds">, 2368 Intrinsic < 2369 [], 2370 [LLVMQualPointerType<1>, // Base global pointer to load from 2371 LLVMQualPointerType<3>, // LDS base pointer to store to 2372 llvm_i32_ty, // Data byte size: 1/2/4 2373 llvm_i32_ty, // imm offset (applied to both global and LDS address) 2374 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, 2375 // bit 1 = sc1, 2376 // bit 4 = scc)) 2377 [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, 2378 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], 2379 "", [SDNPMemOperand]>; 2380def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; 2381 2382// This is IntrHasSideEffects because it reads from a volatile hardware register. 2383def int_amdgcn_pops_exiting_wave_id : 2384 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>; 2385 2386//===----------------------------------------------------------------------===// 2387// GFX10 Intrinsics 2388//===----------------------------------------------------------------------===// 2389 2390// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control> 2391def int_amdgcn_permlane16 : 2392 Intrinsic<[llvm_any_ty], 2393 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2394 [IntrNoMem, IntrConvergent, IntrWillReturn, 2395 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2396 2397// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control> 2398def int_amdgcn_permlanex16 : 2399 Intrinsic<[llvm_any_ty], 2400 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2401 [IntrNoMem, IntrConvergent, IntrWillReturn, 2402 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2403 2404// llvm.amdgcn.mov.dpp8.i32 <src> <sel> 2405// <sel> is a 32-bit constant whose high 8 bits must be zero which selects 2406// the lanes to read from. 2407def int_amdgcn_mov_dpp8 : 2408 Intrinsic<[llvm_anyint_ty], 2409 [LLVMMatchType<0>, llvm_i32_ty], 2410 [IntrNoMem, IntrConvergent, IntrWillReturn, 2411 ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; 2412 2413def int_amdgcn_s_get_waveid_in_workgroup : 2414 ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, 2415 Intrinsic<[llvm_i32_ty], [], 2416 [NoUndef<RetIndex>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2417 2418class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic < 2419 [vt], 2420 [pt, // vaddr 2421 vt], // vdata(VGPR) 2422 [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", 2423 [SDNPMemOperand]>; 2424 2425def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>; 2426 2427// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, 2428// <ray_dir>, <ray_inv_dir>, <texture_descr> 2429// <node_ptr> is i32 or i64. 2430// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32. 2431def int_amdgcn_image_bvh_intersect_ray : 2432 DefaultAttrsIntrinsic<[llvm_v4i32_ty], 2433 [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty, 2434 LLVMMatchType<1>, llvm_v4i32_ty], 2435 [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2436 2437//===----------------------------------------------------------------------===// 2438// GFX11 Intrinsics 2439//===----------------------------------------------------------------------===// 2440 2441// llvm.amdgcn.permlane64 <src0> 2442def int_amdgcn_permlane64 : 2443 Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], 2444 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2445 2446def int_amdgcn_ds_add_gs_reg_rtn : 2447 ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, 2448 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2449 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2450 "", [SDNPMemOperand]>; 2451 2452def int_amdgcn_ds_sub_gs_reg_rtn : 2453 ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, 2454 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2455 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2456 "", [SDNPMemOperand]>; 2457 2458def int_amdgcn_ds_bvh_stack_rtn : 2459 Intrinsic< 2460 [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr 2461 [ 2462 llvm_i32_ty, // %addr 2463 llvm_i32_ty, // %data0 2464 llvm_v4i32_ty, // %data1 2465 llvm_i32_ty, // %offset 2466 ], 2467 [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2468 >; 2469 2470def int_amdgcn_s_wait_event_export_ready : 2471 ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, 2472 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] 2473>; 2474 2475// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2476// 2477// These operations perform a matrix multiplication and accumulation of 2478// the form: D = A * B + C . 2479 2480class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> : 2481 Intrinsic< 2482 [CD], // %D 2483 [ 2484 AB, // %A 2485 LLVMMatchType<1>, // %B 2486 LLVMMatchType<0>, // %C 2487 ], 2488 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2489>; 2490 2491class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> : 2492 Intrinsic< 2493 [CD], // %D 2494 [ 2495 AB, // %A 2496 LLVMMatchType<1>, // %B 2497 LLVMMatchType<0>, // %C 2498 llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 2499 ], 2500 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2501>; 2502 2503class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> : 2504 Intrinsic< 2505 [CD], // %D 2506 [ 2507 llvm_i1_ty, // %A_sign 2508 AB, // %A 2509 llvm_i1_ty, // %B_sign 2510 LLVMMatchType<1>, // %B 2511 LLVMMatchType<0>, // %C 2512 llvm_i1_ty, // %clamp 2513 ], 2514 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2515>; 2516 2517// WMMA GFX11Only 2518 2519// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2520// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. 2521// The content of the other 16-bit half is preserved from the input. 2522 2523defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX11 = { 2524def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2525def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2526 2527// WMMA GFX11Plus 2528 2529def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2530def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2531def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2532def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2533 2534// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2535// The content of the other 16-bit half is undefined. 2536// GFX12: The op_sel bit must be 0. 2537def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2538def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2539} 2540 2541//===----------------------------------------------------------------------===// 2542// GFX12 Intrinsics 2543//===----------------------------------------------------------------------===// 2544 2545// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control> 2546def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, 2547 Intrinsic<[llvm_i32_ty], 2548 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2549 [IntrNoMem, IntrConvergent, IntrWillReturn, 2550 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2551 2552// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control> 2553def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">, 2554 Intrinsic<[llvm_i32_ty], 2555 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2556 [IntrNoMem, IntrConvergent, IntrWillReturn, 2557 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2558 2559// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics 2560// 2561// These operations perform a sparse matrix multiplication and accumulation of 2562// the form: D = A * B + C. 2563// A is sparse matrix, half the size of B, and is expanded using sparsity index. 2564 2565class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2566 Intrinsic< 2567 [CD], // %D 2568 [ 2569 A, // %A 2570 B, // %B 2571 LLVMMatchType<0>, // %C 2572 Index // %Sparsity index for A 2573 ], 2574 [IntrNoMem, IntrConvergent, IntrWillReturn] 2575>; 2576 2577class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2578 Intrinsic< 2579 [CD], // %D 2580 [ 2581 llvm_i1_ty, // %A_sign 2582 A, // %A 2583 llvm_i1_ty, // %B_sign 2584 B, // %B 2585 LLVMMatchType<0>, // %C 2586 Index, // %Sparsity index for A 2587 llvm_i1_ty, // %clamp 2588 ], 2589 [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>] 2590>; 2591 2592defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = { 2593// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2594// 2595// These operations perform a matrix multiplication and accumulation of 2596// the form: D = A * B + C . 2597 2598// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. 2599def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2600def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2601def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2602def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2603// A and B are <16 x iu4>. 2604def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2605 2606def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2607def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2608def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2609def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2610def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2611def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2612def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2613def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2614def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2615def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2616def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2617} 2618 2619def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>; 2620 2621def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2622def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2623def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2624def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2625 2626def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; 2627 2628class AMDGPULoadIntrinsic<LLVMType ptr_ty>: 2629 Intrinsic< 2630 [llvm_any_ty], 2631 [ptr_ty], 2632 [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], 2633 "", 2634 [SDNPMemOperand] 2635 >; 2636 2637// Wave32 2638// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64 2639// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128 2640// Wave64 2641// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64 2642// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128 2643 2644def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; 2645def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>; 2646 2647// i32 @llvm.amdgcn.wave.id() 2648def int_amdgcn_wave_id : 2649 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>; 2650 2651//===----------------------------------------------------------------------===// 2652// Deep learning intrinsics. 2653//===----------------------------------------------------------------------===// 2654 2655// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) 2656// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2657def int_amdgcn_fdot2 : 2658 ClangBuiltin<"__builtin_amdgcn_fdot2">, 2659 DefaultAttrsIntrinsic< 2660 [llvm_float_ty], // %r 2661 [ 2662 llvm_v2f16_ty, // %a 2663 llvm_v2f16_ty, // %b 2664 llvm_float_ty, // %c 2665 llvm_i1_ty // %clamp 2666 ], 2667 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2668 >; 2669 2670// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) 2671// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2672def int_amdgcn_fdot2_f16_f16 : 2673 ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, 2674 DefaultAttrsIntrinsic< 2675 [llvm_half_ty], // %r 2676 [ 2677 llvm_v2f16_ty, // %a 2678 llvm_v2f16_ty, // %b 2679 llvm_half_ty // %c 2680 ], 2681 [IntrNoMem, IntrSpeculatable] 2682 >; 2683 2684// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) 2685// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2686def int_amdgcn_fdot2_bf16_bf16 : 2687 ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, 2688 DefaultAttrsIntrinsic< 2689 [llvm_bfloat_ty], // %r 2690 [ 2691 llvm_v2bf16_ty, // %a 2692 llvm_v2bf16_ty, // %b 2693 llvm_bfloat_ty // %c 2694 ], 2695 [IntrNoMem, IntrSpeculatable] 2696 >; 2697 2698// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) 2699// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2700def int_amdgcn_fdot2_f32_bf16 : 2701 ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, 2702 DefaultAttrsIntrinsic< 2703 [llvm_float_ty], // %r 2704 [ 2705 llvm_v2bf16_ty, // %a 2706 llvm_v2bf16_ty, // %b 2707 llvm_float_ty, // %c 2708 llvm_i1_ty // %clamp 2709 ], 2710 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2711 >; 2712 2713// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) 2714// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2715def int_amdgcn_sdot2 : 2716 ClangBuiltin<"__builtin_amdgcn_sdot2">, 2717 DefaultAttrsIntrinsic< 2718 [llvm_i32_ty], // %r 2719 [ 2720 llvm_v2i16_ty, // %a 2721 llvm_v2i16_ty, // %b 2722 llvm_i32_ty, // %c 2723 llvm_i1_ty // %clamp 2724 ], 2725 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2726 >; 2727 2728// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) 2729// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2730def int_amdgcn_udot2 : 2731 ClangBuiltin<"__builtin_amdgcn_udot2">, 2732 DefaultAttrsIntrinsic< 2733 [llvm_i32_ty], // %r 2734 [ 2735 llvm_v2i16_ty, // %a 2736 llvm_v2i16_ty, // %b 2737 llvm_i32_ty, // %c 2738 llvm_i1_ty // %clamp 2739 ], 2740 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2741 >; 2742 2743// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) 2744// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2745def int_amdgcn_sdot4 : 2746 ClangBuiltin<"__builtin_amdgcn_sdot4">, 2747 DefaultAttrsIntrinsic< 2748 [llvm_i32_ty], // %r 2749 [ 2750 llvm_i32_ty, // %a 2751 llvm_i32_ty, // %b 2752 llvm_i32_ty, // %c 2753 llvm_i1_ty // %clamp 2754 ], 2755 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2756 >; 2757 2758// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) 2759// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2760def int_amdgcn_udot4 : 2761 ClangBuiltin<"__builtin_amdgcn_udot4">, 2762 DefaultAttrsIntrinsic< 2763 [llvm_i32_ty], // %r 2764 [ 2765 llvm_i32_ty, // %a 2766 llvm_i32_ty, // %b 2767 llvm_i32_ty, // %c 2768 llvm_i1_ty // %clamp 2769 ], 2770 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2771 >; 2772 2773// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) 2774// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 2775// a[i in 0. . . 3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); 2776// b[i in 0. . . 3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); 2777// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2778def int_amdgcn_sudot4 : 2779 ClangBuiltin<"__builtin_amdgcn_sudot4">, 2780 DefaultAttrsIntrinsic< 2781 [llvm_i32_ty], // %r 2782 [ 2783 llvm_i1_ty, // %a_sign 2784 llvm_i32_ty, // %a 2785 llvm_i1_ty, // %b_sign 2786 llvm_i32_ty, // %b 2787 llvm_i32_ty, // %c 2788 llvm_i1_ty // %clamp 2789 ], 2790 [IntrNoMem, IntrSpeculatable, 2791 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 2792 >; 2793 2794// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) 2795// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2796// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2797def int_amdgcn_sdot8 : 2798 ClangBuiltin<"__builtin_amdgcn_sdot8">, 2799 DefaultAttrsIntrinsic< 2800 [llvm_i32_ty], // %r 2801 [ 2802 llvm_i32_ty, // %a 2803 llvm_i32_ty, // %b 2804 llvm_i32_ty, // %c 2805 llvm_i1_ty // %clamp 2806 ], 2807 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2808 >; 2809 2810// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) 2811// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2812// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2813def int_amdgcn_udot8 : 2814 ClangBuiltin<"__builtin_amdgcn_udot8">, 2815 DefaultAttrsIntrinsic< 2816 [llvm_i32_ty], // %r 2817 [ 2818 llvm_i32_ty, // %a 2819 llvm_i32_ty, // %b 2820 llvm_i32_ty, // %c 2821 llvm_i1_ty // %clamp 2822 ], 2823 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2824 >; 2825 2826// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) 2827// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 2828// a[i in 0. . . 7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); 2829// b[i in 0. . . 7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i])); 2830// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2831// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2832 def int_amdgcn_sudot8 : 2833 ClangBuiltin<"__builtin_amdgcn_sudot8">, 2834 DefaultAttrsIntrinsic< 2835 [llvm_i32_ty], // %r 2836 [ 2837 llvm_i1_ty, // %a_sign 2838 llvm_i32_ty, // %a 2839 llvm_i1_ty, // %b_sign 2840 llvm_i32_ty, // %b 2841 llvm_i32_ty, // %c 2842 llvm_i1_ty // %clamp 2843 ], 2844 [IntrNoMem, IntrSpeculatable, 2845 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 2846 >; 2847 2848// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c) 2849// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2850class AMDGPU8bitFloatDot4Intrinsic : 2851 ClangBuiltin<!subst("int", "__builtin", NAME)>, 2852 DefaultAttrsIntrinsic< 2853 [llvm_float_ty], // %r 2854 [ 2855 llvm_i32_ty, // %a 2856 llvm_i32_ty, // %b 2857 llvm_float_ty, // %c 2858 ], 2859 [IntrNoMem, IntrSpeculatable] 2860 >; 2861 2862def int_amdgcn_dot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 2863def int_amdgcn_dot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 2864def int_amdgcn_dot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 2865def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 2866 2867//===----------------------------------------------------------------------===// 2868// gfx908 intrinsics 2869// ===----------------------------------------------------------------------===// 2870 2871def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2872 2873// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp 2874class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : 2875 ClangBuiltin<!subst("int", "__builtin", NAME)>, 2876 DefaultAttrsIntrinsic<[DestTy], 2877 [SrcABTy, SrcABTy, DestTy, 2878 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2879 [IntrConvergent, IntrNoMem, 2880 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; 2881 2882defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = { 2883def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>; 2884def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 2885def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 2886def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 2887def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 2888def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>; 2889def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 2890def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 2891def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 2892def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 2893def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>; 2894def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 2895def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 2896def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 2897def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 2898def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>; 2899def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 2900def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 2901def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 2902def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 2903} 2904 2905//===----------------------------------------------------------------------===// 2906// gfx90a intrinsics 2907// ===----------------------------------------------------------------------===// 2908 2909def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2910def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2911def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2912def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2913def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2914 2915defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = { 2916def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>; 2917def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; 2918def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; 2919def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; 2920def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; 2921 2922// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. 2923// Three bits corresponding to the neg modifier applied to the respective 2924// source operand. 2925def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>; 2926def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>; 2927} 2928 2929//===----------------------------------------------------------------------===// 2930// gfx940 intrinsics 2931// ===----------------------------------------------------------------------===// 2932 2933class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> : 2934 AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>; 2935 2936multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> { 2937 foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in 2938 def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>; 2939} 2940 2941// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid 2942class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> : 2943 ClangBuiltin<!subst("int", "__builtin", NAME)>, 2944 DefaultAttrsIntrinsic<[DestTy], 2945 [SrcA, SrcB, DestTy, llvm_i32_ty, 2946 llvm_i32_ty, llvm_i32_ty], 2947 [IntrConvergent, IntrNoMem, 2948 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; 2949 2950class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> : 2951 AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>; 2952 2953multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> { 2954 foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in 2955 def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>; 2956} 2957 2958// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. 2959def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>; 2960def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>; 2961 2962defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = { 2963def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>; 2964def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>; 2965def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>; 2966def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>; 2967 2968defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>; 2969defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>; 2970 2971def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; 2972def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; 2973def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; 2974def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; 2975def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; 2976def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; 2977 2978defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>; 2979defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>; 2980} 2981 2982// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3] 2983// byte_sel selects byte from srcA. 2984def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">, 2985 DefaultAttrsIntrinsic<[llvm_float_ty], 2986 [llvm_i32_ty, llvm_i32_ty], 2987 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 2988 2989// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3] 2990def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">, 2991 DefaultAttrsIntrinsic<[llvm_float_ty], 2992 [llvm_i32_ty, llvm_i32_ty], 2993 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 2994 2995// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel 2996// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes. 2997def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">, 2998 DefaultAttrsIntrinsic<[llvm_v2f32_ty], 2999 [llvm_i32_ty, llvm_i1_ty], 3000 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3001 3002// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel. 3003def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">, 3004 DefaultAttrsIntrinsic<[llvm_v2f32_ty], 3005 [llvm_i32_ty, llvm_i1_ty], 3006 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3007 3008// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel 3009// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes. 3010def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">, 3011 DefaultAttrsIntrinsic<[llvm_i32_ty], 3012 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], 3013 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3014 3015// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel 3016def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">, 3017 DefaultAttrsIntrinsic<[llvm_i32_ty], 3018 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], 3019 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3020 3021// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] 3022// byte_sel selects byte to write into vdst. 3023def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">, 3024 DefaultAttrsIntrinsic<[llvm_i32_ty], 3025 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3026 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3027 3028// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] 3029def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, 3030 DefaultAttrsIntrinsic<[llvm_i32_ty], 3031 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3032 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3033 3034//===----------------------------------------------------------------------===// 3035// Special Intrinsics for backend internal use only. No frontend 3036// should emit calls to these. 3037// ===----------------------------------------------------------------------===// 3038// 3039// Control-flow intrinsics in LLVM IR are convergent because they represent the 3040// wave CFG, i.e., sets of threads that are "converged" or "execute in 3041// lock-step". But they exist during a small window in the lowering process, 3042// inserted after the structurizer and then translated to equivalent MIR 3043// pseudos. So rather than create convergence tokens for these builtins, we 3044// simply mark them as not convergent. 3045// 3046// This is really a workaround to allow control flow lowering in the presence of 3047// convergence control tokens. The corresponding MIR pseudos are marked as 3048// having side effects, which is sufficient to prevent optimizations without 3049// having to mark them as convergent. 3050def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], 3051 [llvm_i1_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3052>; 3053 3054def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], 3055 [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3056>; 3057 3058def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty], 3059 [llvm_i1_ty, LLVMMatchType<0>], 3060 [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree] 3061>; 3062 3063def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], 3064 [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3065>; 3066 3067def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], 3068 [IntrWillReturn, IntrNoCallback, IntrNoFree]>; 3069 3070// Represent unreachable in a divergent region. 3071def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>; 3072 3073// Emit 2.5 ulp, no denormal division. Should only be inserted by 3074// pass based on !fpmath metadata. 3075def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< 3076 [llvm_float_ty], [llvm_float_ty, llvm_float_ty], 3077 [IntrNoMem, IntrSpeculatable] 3078>; 3079 3080/// Emit an addrspacecast without null pointer checking. 3081/// Should only be inserted by a pass based on analysis of an addrspacecast's src. 3082def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic< 3083 [llvm_anyptr_ty], [llvm_anyptr_ty], 3084 [IntrNoMem, IntrSpeculatable] 3085>; 3086} 3087