//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Transforms/Utils/LowerAtomic.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
    DisableLoopAlignment("amdgpu-disable-loop-alignment",
                         cl::desc("Do not align and prefetch loops"),
                         cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));

// TODO: This option should be removed once we switch to always using PTRADD in
// the SelectionDAG.
static cl::opt<bool> UseSelectionDAGPTRADD(
    "amdgpu-use-sdag-ptradd", cl::Hidden,
    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
             "SelectionDAG ISel"),
    cl::init(false));

static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}

static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

  const SIRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();

  addRegisterClass(MVT::f64, V64RegClass);
  addRegisterClass(MVT::v2f32, V64RegClass);
  addRegisterClass(MVT::Untyped, V64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));

  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));

  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
  addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));

  addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
  addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));

  addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
  addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));

  addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
  addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));

  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));

  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
      addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
    } else {
      addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
    }

    // Unless there are also VOP3P operations, not all operations are really
    // legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
  }

  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // The boolean content concept here is too inflexible. Compares only ever
  // really produce a 1-bit result. Any copy/extend from these will turn into a
  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
  // it's what most targets use.
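  // For example, under ZeroOrOneBooleanContent a (zext (setcc ...)) is known
  // to produce only 0 or 1, so generic combines can drop a redundant
  // "and x, 1" mask on boolean results.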
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  if (isTypeLegal(MVT::bf16)) {
    for (unsigned Opc :
         {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
          ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
          ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
          ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
          ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
          ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
          ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
          ISD::SETCC}) {
      // FIXME: The promoted to type shouldn't need to be explicit
      setOperationAction(Opc, MVT::bf16, Promote);
      AddPromotedToType(Opc, MVT::bf16, MVT::f32);
    }

    setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);

    setOperationAction(ISD::SELECT, MVT::bf16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);

    setOperationAction(ISD::FABS, MVT::bf16, Legal);
    setOperationAction(ISD::FNEG, MVT::bf16, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);

    // We only need to custom lower because we can't specify an action for bf16
    // sources.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);

  setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);

  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);

  setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
                     Expand);

#if 0
  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
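  // Illustration: for a wide type like v8i32, an arithmetic node such as ADD
  // falls into the default case below and is marked Expand (split or
  // scalarized by the legalizer), while subvector insert/extract/concat get
  // custom lowering.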
  for (MVT VT :
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::UNDEF:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::SCALAR_TO_VECTOR:
      case ISD::IS_FPCLASS:
        break;
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::INSERT_SUBVECTOR:
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
  }

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
  }

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
  }

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE,
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
                     Custom);

  if (Subtarget->hasPkMovB32()) {
    // TODO: 16-bit element vectors should be legal with even aligned elements.
    // TODO: Can be legal with wider source types than the result with
    // subregister extracts.
    setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
  }

  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                     Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  // Deal with vec5/6/7 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
                     Expand);

  setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);

  // FIXME: This should be narrowed to i32, but that only happens if i64 is
  // illegal.
  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
  setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);

  // On SI this is s_memtime; on VI it is s_memrealtime.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (Subtarget->hasSMemRealTime() ||
      Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
  setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
    setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
  } else {
    setOperationAction(ISD::FSQRT, MVT::f16, Custom);
  }

  if (Subtarget->hasMadMacF32Insts())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI())
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  // Clamp modifier on add/sub
  if (Subtarget->hasIntClamp())
    setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);

  if (Subtarget->hasAddNoCarry())
    setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                       Legal);

  setOperationAction(
      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
                       Legal);
  else
    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
                       MVT::f64, Custom);

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
                     Legal);
  setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);

  // Custom lower these because we can't specify a rule based on an illegal
  // source bf16.
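  // (Operation actions here are keyed on the result type, so an FP_EXTEND
  // producing f32/f64 from a bf16 operand can only be caught by custom
  // lowering the f32/f64 result and inspecting the source type there.)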
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                        ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
                       MVT::i16, Legal);

    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
                       MVT::i16, Expand);

    setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
                        ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
                        ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
                        ISD::CTPOP},
                       MVT::i16, Promote);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i1, Custom);

    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // BF16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
                        ISD::FSIN, ISD::FROUND},
                       MVT::f16, Custom);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);

    // F16 - VOP2 Actions.
    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
                       Expand);
    setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
    setOperationAction(ISD::FFREXP, MVT::f16, Custom);
    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT :
         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
        case ISD::IS_FPCLASS:
          break;
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // v_perm_b32 can handle either of these.
    setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
    setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v8i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::STORE, MVT::v16i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);

    setOperationAction(ISD::STORE, MVT::v32i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v4i32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v8i32, Expand);

    setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Subtarget->hasVOP3PInsts() ? Legal : Custom);

    setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);

    // Can do this in one BFI plus a constant materialize.
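    // Roughly: a packed copysign becomes a v_bfi_b32 with a 0x7fff7fff mask,
    // selecting the magnitude bits from one source and the sign bits from the
    // other.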
    setOperationAction(ISD::FCOPYSIGN,
                       {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                        MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                        MVT::v32f16, MVT::v32bf16},
                       Custom);

    setOperationAction(
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        MVT::f16, Custom);
    setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);

    setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
                        ISD::FMAXIMUMNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);

    setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Expand);

    for (MVT Vec16 :
         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
      setOperationAction(
          {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
          Vec16, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
                        ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                        ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
                       MVT::v2i16, Legal);

    setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
                        ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
                       MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE,
                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
                       Custom);

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
      // Split vector operations.
      setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
                          ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                          ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
                          ISD::SSUBSAT},
                         VT, Custom);

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      // Split vector operations.
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                         VT, Custom);

    setOperationAction(
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                       Custom);

    if (Subtarget->hasPackedFP32Ops()) {
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
                         MVT::v2f32, Legal);
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
                         {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                         Custom);
    }
  }

  setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);

    setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
  }

  setOperationAction(ISD::SELECT,
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},
                     Custom);

  setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);

  if (Subtarget->hasScalarSMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Custom);

  if (Subtarget->hasMad64_32())
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

  if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
  } else {
    // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
    if (Subtarget->hasMinimum3Maximum3F32())
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);

    if (Subtarget->hasMinimum3Maximum3PKF16()) {
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);

      // If only the vector form is available, we need to widen to a vector.
      if (!Subtarget->hasMinimum3Maximum3F16())
        setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    // We want to break these into v2f16 pieces, not scalarize.
    setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);
  }

  setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                      MVT::i8},
                     Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::INTRINSIC_VOID,
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
  setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

  // TODO: Could move this to custom lowering, could benefit from combines on
  // extract of relevant bits.
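  // (The FP mode bits live in the MODE hardware register, so reading them for
  // GET_FPMODE is a single register read and can stay Legal.)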
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);

  setOperationAction(ISD::MUL, MVT::i1, Promote);

  if (Subtarget->hasBF16ConversionInsts()) {
    setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
  }

  if (Subtarget->hasCvtPkF16F32Inst()) {
    setOperationAction(ISD::FP_ROUND,
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
                       Custom);
  }

  setTargetDAGCombine({ISD::ADD,
                       ISD::PTRADD,
                       ISD::UADDO_CARRY,
                       ISD::SUB,
                       ISD::USUBO_CARRY,
                       ISD::MUL,
                       ISD::FADD,
                       ISD::FSUB,
                       ISD::FDIV,
                       ISD::FMUL,
                       ISD::FMINNUM,
                       ISD::FMAXNUM,
                       ISD::FMINNUM_IEEE,
                       ISD::FMAXNUM_IEEE,
                       ISD::FMINIMUM,
                       ISD::FMAXIMUM,
                       ISD::FMINIMUMNUM,
                       ISD::FMAXIMUMNUM,
                       ISD::FMA,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::SETCC,
                       ISD::SELECT,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::AND,
                       ISD::OR,
                       ISD::XOR,
                       ISD::SHL,
                       ISD::SRL,
                       ISD::SRA,
                       ISD::FSHR,
                       ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP,
                       ISD::FCANONICALIZE,
                       ISD::SCALAR_TO_VECTOR,
                       ISD::ZERO_EXTEND,
                       ISD::SIGN_EXTEND_INREG,
                       ISD::EXTRACT_VECTOR_ELT,
                       ISD::INSERT_VECTOR_ELT,
                       ISD::FCOPYSIGN});

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
    setTargetDAGCombine(ISD::FP_ROUND);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine({ISD::LOAD,
                       ISD::STORE,
                       ISD::ATOMIC_LOAD,
                       ISD::ATOMIC_STORE,
                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_SWAP,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                       ISD::INTRINSIC_VOID,
                       ISD::INTRINSIC_W_CHAIN});

  // FIXME: In other contexts we pretend this is a per-function property.
  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }

ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
  return RCRegs;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case, when denormals are enabled, which we don't
// currently handle, where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 &&
         SrcVT.getScalarType() == MVT::f16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(DAG.getMachineFunction());
}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(*MI.getMF());
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 16) {
      if (Subtarget->has16BitInsts()) {
        if (VT.isInteger())
          return MVT::v2i16;
        return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
      }
      return VT.isInteger() ? MVT::i32 : MVT::f32;
    }

    if (Size < 16)
      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
  }

  if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    // FIXME: Should probably promote 8-bit vectors to i16.
    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    if (Size <= 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
                                 const DataLayout &DL, Type *Ty,
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  LLVMContext &Ctx = Ty->getContext();
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                            NumElts);
  }

  return TLI.getValueType(DL, Ty);
}

// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);

  // TFE intrinsics return an aggregate type.
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}

/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
/// in-memory representation. This return value is a custom type because there
/// is no MVT::i160 and adding one breaks integer promotion logic. While this
/// could cause issues during codegen, these address space 7 pointers will be
/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
/// in order to allow pre-codegen passes that query TargetTransformInfo, often
/// for cost modeling, to work. (This also sets us up decently for doing the
/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
    return MVT::amdgpuBufferFatPointer;
  if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
  return AMDGPUTargetLowering::getPointerTy(DL, AS);
}
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
  if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 160) ||
      (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192))
    return MVT::v8i32;
  return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  Info.flags = MachineMemOperand::MONone;
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    Info.flags |= MachineMemOperand::MOInvariant;
  if (CI.hasMetadata(LLVMContext::MD_nontemporal))
    Info.flags |= MachineMemOperand::MONonTemporal;
  Info.flags |= getTargetMMOFlags(CI);

  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeSet Attr =
        Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
    MemoryEffects ME = Attr.getMemoryEffects();
    if (ME.doesNotAccessMemory())
      return false;

    // TODO: Should images get their own address space?
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
    if (RsrcIntr->IsImage) {
      const AMDGPU::ImageDimIntrinsicInfo *Intr =
          AMDGPU::getImageDimIntrinsicInfo(IntrID);
      BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
      Info.align.reset();
    }

    Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
        // We conservatively set the memory operand of a buffer intrinsic to the
        // base resource pointer, so that we can access alias information about
        // those pointers. Cases like "this points at the same value
        // but with a different offset" are handled in
        // areMemAccessesTriviallyDisjoint.
        Info.ptrVal = RsrcArg;
    }

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (!IsSPrefetch) {
      auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
      if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
        Info.flags |= MachineMemOperand::MOVolatile;
    }

    Info.flags |= MachineMemOperand::MODereferenceable;
    if (ME.onlyReadsMemory()) {
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

        if (!BaseOpcode->Gather4) {
          // If this isn't a gather, we may have excess loaded elements in the
          // IR type. Check the dmask for the real number of elements loaded.
          unsigned DMask =
              cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
          MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        }

        Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
                                             CI.getType(), MaxNumLanes);
      } else {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
      }

      // FIXME: What does alignment mean for an image?
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.flags |= MachineMemOperand::MOLoad;
    } else if (ME.onlyWritesMemory()) {
      Info.opc = ISD::INTRINSIC_VOID;

      Type *DataTy = CI.getArgOperand(0)->getType();
      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
        unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
        Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
                                           DMaskLanes);
      } else
        Info.memVT = getValueType(MF.getDataLayout(), DataTy);

      Info.flags |= MachineMemOperand::MOStore;
    } else {
      // Atomic, NoReturn Sampler or prefetch
      Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
                                          : ISD::INTRINSIC_W_CHAIN;
      Info.flags |=
          MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;

      if (!IsSPrefetch)
        Info.flags |= MachineMemOperand::MOStore;

      switch (IntrID) {
      default:
        if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
          // Fake memory access type for no return sampler intrinsics
          Info.memVT = MVT::i32;
        } else {
          // XXX - Should this be volatile without known ordering?
          Info.flags |= MachineMemOperand::MOVolatile;
          Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
        }
        break;
      case Intrinsic::amdgcn_raw_buffer_load_lds:
      case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
      case Intrinsic::amdgcn_struct_buffer_load_lds:
      case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
        Info.ptrVal = CI.getArgOperand(1);
        return true;
      }
      case Intrinsic::amdgcn_raw_atomic_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
      case Intrinsic::amdgcn_struct_atomic_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        Info.memVT =
            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
                                    std::numeric_limits<unsigned>::max());
        Info.flags &= ~MachineMemOperand::MOStore;
        return true;
      }
      }
    }
    return true;
  }

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
    Info.ptrVal = nullptr;
    Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
    if (!Vol->isZero())
      Info.flags |= MachineMemOperand::MOVolatile;

    return true;
  }
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
                   ? ISD::INTRINSIC_W_CHAIN
                   : ISD::INTRINSIC_VOID;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.memVT = MVT::i64;
    Info.size = 8;
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::amdgcn_global_atomic_csub: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT =
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ? CI.getType()
                       : cast<StructType>(CI.getType())
                             ->getElementType(0)); // XXX: what is correct VT?

    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
    Info.align.reset();
    Info.flags |=
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
    return true;
  }
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                  MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align.reset();
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.opc = ISD::INTRINSIC_VOID;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTargetMachine());

    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Info.ptrVal = MFI->getGWSPSV(TM);

    // This is an abstract access, but we need to specify a type and size.
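    // (GWS operations don't reference an IR-visible pointer; the GWS
    // pseudo-source value above stands in so the access still participates in
    // alias analysis.)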
1494 Info.memVT = MVT::i32; 1495 Info.size = 4; 1496 Info.align = Align(4); 1497 1498 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) 1499 Info.flags |= MachineMemOperand::MOLoad; 1500 else 1501 Info.flags |= MachineMemOperand::MOStore; 1502 return true; 1503 } 1504 case Intrinsic::amdgcn_load_to_lds: 1505 case Intrinsic::amdgcn_global_load_lds: { 1506 Info.opc = ISD::INTRINSIC_VOID; 1507 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); 1508 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); 1509 Info.ptrVal = CI.getArgOperand(1); 1510 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 1511 return true; 1512 } 1513 case Intrinsic::amdgcn_ds_bvh_stack_rtn: 1514 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: 1515 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: 1516 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: { 1517 Info.opc = ISD::INTRINSIC_W_CHAIN; 1518 1519 const GCNTargetMachine &TM = 1520 static_cast<const GCNTargetMachine &>(getTargetMachine()); 1521 1522 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1523 Info.ptrVal = MFI->getGWSPSV(TM); 1524 1525 // This is an abstract access, but we need to specify a type and size. 1526 Info.memVT = MVT::i32; 1527 Info.size = 4; 1528 Info.align = Align(4); 1529 1530 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 1531 return true; 1532 } 1533 case Intrinsic::amdgcn_s_prefetch_data: { 1534 Info.opc = ISD::INTRINSIC_VOID; 1535 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); 1536 Info.ptrVal = CI.getArgOperand(0); 1537 Info.flags |= MachineMemOperand::MOLoad; 1538 return true; 1539 } 1540 default: 1541 return false; 1542 } 1543 } 1544 1545 void SITargetLowering::CollectTargetIntrinsicOperands( 1546 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const { 1547 switch (cast<IntrinsicInst>(I).getIntrinsicID()) { 1548 case Intrinsic::amdgcn_addrspacecast_nonnull: { 1549 // The DAG's ValueType loses the addrspaces. 1550 // Add them as 2 extra Constant operands "from" and "to". 
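// For example, a nonnull cast from a flat pointer to an LDS pointer adds the
// address space constants for FLAT_ADDRESS and LOCAL_ADDRESS, taken from the
// pointer types of the call's operand and result (illustrative case only).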
1551 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace(); 1552 unsigned DstAS = I.getType()->getPointerAddressSpace(); 1553 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32)); 1554 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32)); 1555 break; 1556 } 1557 default: 1558 break; 1559 } 1560 } 1561 1562 bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, 1563 SmallVectorImpl<Value *> &Ops, 1564 Type *&AccessTy) const { 1565 Value *Ptr = nullptr; 1566 switch (II->getIntrinsicID()) { 1567 case Intrinsic::amdgcn_atomic_cond_sub_u32: 1568 case Intrinsic::amdgcn_ds_append: 1569 case Intrinsic::amdgcn_ds_consume: 1570 case Intrinsic::amdgcn_ds_load_tr8_b64: 1571 case Intrinsic::amdgcn_ds_load_tr16_b128: 1572 case Intrinsic::amdgcn_ds_load_tr4_b64: 1573 case Intrinsic::amdgcn_ds_load_tr6_b96: 1574 case Intrinsic::amdgcn_ds_read_tr4_b64: 1575 case Intrinsic::amdgcn_ds_read_tr6_b96: 1576 case Intrinsic::amdgcn_ds_read_tr8_b64: 1577 case Intrinsic::amdgcn_ds_read_tr16_b64: 1578 case Intrinsic::amdgcn_ds_ordered_add: 1579 case Intrinsic::amdgcn_ds_ordered_swap: 1580 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64: 1581 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: 1582 case Intrinsic::amdgcn_flat_atomic_fmax_num: 1583 case Intrinsic::amdgcn_flat_atomic_fmin_num: 1584 case Intrinsic::amdgcn_global_atomic_csub: 1585 case Intrinsic::amdgcn_global_atomic_fmax_num: 1586 case Intrinsic::amdgcn_global_atomic_fmin_num: 1587 case Intrinsic::amdgcn_global_atomic_ordered_add_b64: 1588 case Intrinsic::amdgcn_global_load_tr_b64: 1589 case Intrinsic::amdgcn_global_load_tr_b128: 1590 case Intrinsic::amdgcn_global_load_tr4_b64: 1591 case Intrinsic::amdgcn_global_load_tr6_b96: 1592 Ptr = II->getArgOperand(0); 1593 break; 1594 case Intrinsic::amdgcn_load_to_lds: 1595 case Intrinsic::amdgcn_global_load_lds: 1596 Ptr = II->getArgOperand(1); 1597 break; 1598 default: 1599 return false; 1600 } 1601 AccessTy = II->getType(); 1602 Ops.push_back(Ptr); 1603 return true; 1604 } 1605 1606 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, 1607 unsigned AddrSpace) const { 1608 if (!Subtarget->hasFlatInstOffsets()) { 1609 // Flat instructions do not have offsets, and only have the register 1610 // address. 1611 return AM.BaseOffs == 0 && AM.Scale == 0; 1612 } 1613 1614 decltype(SIInstrFlags::FLAT) FlatVariant = 1615 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal 1616 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch 1617 : SIInstrFlags::FLAT; 1618 1619 return AM.Scale == 0 && 1620 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( 1621 AM.BaseOffs, AddrSpace, FlatVariant)); 1622 } 1623 1624 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { 1625 if (Subtarget->hasFlatGlobalInsts()) 1626 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS); 1627 1628 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { 1629 // Assume the we will use FLAT for all global memory accesses 1630 // on VI. 1631 // FIXME: This assumption is currently wrong. On VI we still use 1632 // MUBUF instructions for the r + i addressing mode. As currently 1633 // implemented, the MUBUF instructions only work on buffer < 4GB. 1634 // It may be possible to support > 4GB buffers with MUBUF instructions, 1635 // by setting the stride value in the resource descriptor which would 1636 // increase the size limit to (stride * 4GB). 
However, this is risky, 1637 // because it has never been validated. 1638 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS); 1639 } 1640 1641 return isLegalMUBUFAddressingMode(AM); 1642 } 1643 1644 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 1645 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 1646 // additionally can do r + r + i with addr64. 32-bit has more addressing 1647 // mode options. Depending on the resource constant, it can also do 1648 // (i64 r0) + (i32 r1) * (i14 i). 1649 // 1650 // Private arrays end up using a scratch buffer most of the time, so also 1651 // assume those use MUBUF instructions. Scratch loads / stores are currently 1652 // implemented as mubuf instructions with offen bit set, so slightly 1653 // different than the normal addr64. 1654 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1655 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs)) 1656 return false; 1657 1658 // FIXME: Since we can split immediate into soffset and immediate offset, 1659 // would it make sense to allow any immediate? 1660 1661 switch (AM.Scale) { 1662 case 0: // r + i or just i, depending on HasBaseReg. 1663 return true; 1664 case 1: 1665 return true; // We have r + r or r + i. 1666 case 2: 1667 if (AM.HasBaseReg) { 1668 // Reject 2 * r + r. 1669 return false; 1670 } 1671 1672 // Allow 2 * r as r + r 1673 // Or 2 * r + i is allowed as r + r + i. 1674 return true; 1675 default: // Don't allow n * r 1676 return false; 1677 } 1678 } 1679 1680 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 1681 const AddrMode &AM, Type *Ty, 1682 unsigned AS, 1683 Instruction *I) const { 1684 // No global is ever allowed as a base. 1685 if (AM.BaseGV) 1686 return false; 1687 1688 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 1689 return isLegalGlobalAddressingMode(AM); 1690 1691 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 1692 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 1693 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || 1694 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) { 1695 // If the offset isn't a multiple of 4, it probably isn't going to be 1696 // correctly aligned. 1697 // FIXME: Can we get the real alignment here? 1698 if (AM.BaseOffs % 4 != 0) 1699 return isLegalMUBUFAddressingMode(AM); 1700 1701 if (!Subtarget->hasScalarSubwordLoads()) { 1702 // There are no SMRD extloads, so if we have to do a small type access we 1703 // will use a MUBUF load. 1704 // FIXME?: We also need to do this if unaligned, but we don't know the 1705 // alignment here. 1706 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) 1707 return isLegalGlobalAddressingMode(AM); 1708 } 1709 1710 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 1711 // SMRD instructions have an 8-bit, dword offset on SI. 1712 if (!isUInt<8>(AM.BaseOffs / 4)) 1713 return false; 1714 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { 1715 // On CI+, this can also be a 32-bit literal constant offset. If it fits 1716 // in 8-bits, it can use a smaller encoding. 1717 if (!isUInt<32>(AM.BaseOffs / 4)) 1718 return false; 1719 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) { 1720 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 1721 if (!isUInt<20>(AM.BaseOffs)) 1722 return false; 1723 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) { 1724 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative 1725 // for S_BUFFER_* instructions). 
1726 if (!isInt<21>(AM.BaseOffs)) 1727 return false; 1728 } else { 1729 // On GFX12, all offsets are signed 24-bit in bytes. 1730 if (!isInt<24>(AM.BaseOffs)) 1731 return false; 1732 } 1733 1734 if ((AS == AMDGPUAS::CONSTANT_ADDRESS || 1735 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 1736 AM.BaseOffs < 0) { 1737 // Scalar (non-buffer) loads can only use a negative offset if 1738 // soffset+offset is non-negative. Since the compiler can only prove that 1739 // in a few special cases, it is safer to claim that negative offsets are 1740 // not supported. 1741 return false; 1742 } 1743 1744 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 1745 return true; 1746 1747 if (AM.Scale == 1 && AM.HasBaseReg) 1748 return true; 1749 1750 return false; 1751 } 1752 1753 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 1754 return Subtarget->enableFlatScratch() 1755 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS) 1756 : isLegalMUBUFAddressingMode(AM); 1757 1758 if (AS == AMDGPUAS::LOCAL_ADDRESS || 1759 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) { 1760 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 1761 // field. 1762 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 1763 // an 8-bit dword offset but we don't know the alignment here. 1764 if (!isUInt<16>(AM.BaseOffs)) 1765 return false; 1766 1767 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 1768 return true; 1769 1770 if (AM.Scale == 1 && AM.HasBaseReg) 1771 return true; 1772 1773 return false; 1774 } 1775 1776 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { 1777 // For an unknown address space, this usually means that this is for some 1778 // reason being used for pure arithmetic, and not based on some addressing 1779 // computation. We don't have instructions that compute pointers with any 1780 // addressing modes, so treat them as having no offset like flat 1781 // instructions. 1782 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS); 1783 } 1784 1785 // Assume a user alias of global for unknown address spaces. 1786 return isLegalGlobalAddressingMode(AM); 1787 } 1788 1789 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, 1790 const MachineFunction &MF) const { 1791 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) 1792 return (MemVT.getSizeInBits() <= 4 * 32); 1793 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 1794 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); 1795 return (MemVT.getSizeInBits() <= MaxPrivateBits); 1796 } 1797 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) 1798 return (MemVT.getSizeInBits() <= 2 * 32); 1799 return true; 1800 } 1801 1802 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( 1803 unsigned Size, unsigned AddrSpace, Align Alignment, 1804 MachineMemOperand::Flags Flags, unsigned *IsFast) const { 1805 if (IsFast) 1806 *IsFast = 0; 1807 1808 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || 1809 AddrSpace == AMDGPUAS::REGION_ADDRESS) { 1810 // Check if alignment requirements for ds_read/write instructions are 1811 // disabled. 1812 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4)) 1813 return false; 1814 1815 Align RequiredAlignment( 1816 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment. 
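// E.g. for a 96-bit access: divideCeil(96, 8) = 12 bytes, rounded up to the
// next power of two, gives a 16-byte natural alignment requirement.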
1817 if (Subtarget->hasLDSMisalignedBug() && Size > 32 && 1818 Alignment < RequiredAlignment) 1819 return false; 1820 1821 // Either, the alignment requirements are "enabled", or there is an 1822 // unaligned LDS access related hardware bug though alignment requirements 1823 // are "disabled". In either case, we need to check for proper alignment 1824 // requirements. 1825 // 1826 switch (Size) { 1827 case 64: 1828 // SI has a hardware bug in the LDS / GDS bounds checking: if the base 1829 // address is negative, then the instruction is incorrectly treated as 1830 // out-of-bounds even if base + offsets is in bounds. Split vectorized 1831 // loads here to avoid emitting ds_read2_b32. We may re-combine the 1832 // load later in the SILoadStoreOptimizer. 1833 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8)) 1834 return false; 1835 1836 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we 1837 // can do a 4 byte aligned, 8 byte access in a single operation using 1838 // ds_read2/write2_b32 with adjacent offsets. 1839 RequiredAlignment = Align(4); 1840 1841 if (Subtarget->hasUnalignedDSAccessEnabled()) { 1842 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/ 1843 // ds_write2_b32 depending on the alignment. In either case with either 1844 // alignment there is no faster way of doing this. 1845 1846 // The numbers returned here and below are not additive, it is a 'speed 1847 // rank'. They are just meant to be compared to decide if a certain way 1848 // of lowering an operation is faster than another. For that purpose 1849 // naturally aligned operation gets it bitsize to indicate that "it 1850 // operates with a speed comparable to N-bit wide load". With the full 1851 // alignment ds128 is slower than ds96 for example. If underaligned it 1852 // is comparable to a speed of a single dword access, which would then 1853 // mean 32 < 128 and it is faster to issue a wide load regardless. 1854 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a 1855 // wider load which will not be aligned anymore the latter is slower. 1856 if (IsFast) 1857 *IsFast = (Alignment >= RequiredAlignment) ? 64 1858 : (Alignment < Align(4)) ? 32 1859 : 1; 1860 return true; 1861 } 1862 1863 break; 1864 case 96: 1865 if (!Subtarget->hasDS96AndDS128()) 1866 return false; 1867 1868 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on 1869 // gfx8 and older. 1870 1871 if (Subtarget->hasUnalignedDSAccessEnabled()) { 1872 // Naturally aligned access is fastest. However, also report it is Fast 1873 // if memory is aligned less than DWORD. A narrow load or store will be 1874 // be equally slow as a single ds_read_b96/ds_write_b96, but there will 1875 // be more of them, so overall we will pay less penalty issuing a single 1876 // instruction. 1877 1878 // See comment on the values above. 1879 if (IsFast) 1880 *IsFast = (Alignment >= RequiredAlignment) ? 96 1881 : (Alignment < Align(4)) ? 32 1882 : 1; 1883 return true; 1884 } 1885 1886 break; 1887 case 128: 1888 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128()) 1889 return false; 1890 1891 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on 1892 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a 1893 // single operation using ds_read2/write2_b64. 1894 RequiredAlignment = Align(8); 1895 1896 if (Subtarget->hasUnalignedDSAccessEnabled()) { 1897 // Naturally aligned access is fastest. 
However, also report it is Fast 1898 // if memory is aligned less than DWORD. A narrow load or store will be 1899 // be equally slow as a single ds_read_b128/ds_write_b128, but there 1900 // will be more of them, so overall we will pay less penalty issuing a 1901 // single instruction. 1902 1903 // See comment on the values above. 1904 if (IsFast) 1905 *IsFast = (Alignment >= RequiredAlignment) ? 128 1906 : (Alignment < Align(4)) ? 32 1907 : 1; 1908 return true; 1909 } 1910 1911 break; 1912 default: 1913 if (Size > 32) 1914 return false; 1915 1916 break; 1917 } 1918 1919 // See comment on the values above. 1920 // Note that we have a single-dword or sub-dword here, so if underaligned 1921 // it is a slowest possible access, hence returned value is 0. 1922 if (IsFast) 1923 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0; 1924 1925 return Alignment >= RequiredAlignment || 1926 Subtarget->hasUnalignedDSAccessEnabled(); 1927 } 1928 1929 // FIXME: We have to be conservative here and assume that flat operations 1930 // will access scratch. If we had access to the IR function, then we 1931 // could determine if any private memory was used in the function. 1932 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || 1933 AddrSpace == AMDGPUAS::FLAT_ADDRESS) { 1934 bool AlignedBy4 = Alignment >= Align(4); 1935 if (IsFast) 1936 *IsFast = AlignedBy4; 1937 1938 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); 1939 } 1940 1941 // So long as they are correct, wide global memory operations perform better 1942 // than multiple smaller memory ops -- even when misaligned 1943 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) { 1944 if (IsFast) 1945 *IsFast = Size; 1946 1947 return Alignment >= Align(4) || 1948 Subtarget->hasUnalignedBufferAccessEnabled(); 1949 } 1950 1951 // Ensure robust out-of-bounds guarantees for buffer accesses are met if 1952 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper 1953 // out-of-bounds behavior, but in the edge case where an access starts 1954 // out-of-bounds and then enter in-bounds, the entire access would be treated 1955 // as out-of-bounds. Prevent misaligned memory accesses by requiring the 1956 // natural alignment of buffer accesses. 1957 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || 1958 AddrSpace == AMDGPUAS::BUFFER_RESOURCE || 1959 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) { 1960 if (!Subtarget->hasRelaxedBufferOOBMode() && 1961 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8)))) 1962 return false; 1963 } 1964 1965 // Smaller than dword value must be aligned. 1966 if (Size < 32) 1967 return false; 1968 1969 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 1970 // byte-address are ignored, thus forcing Dword alignment. 1971 // This applies to private, global, and constant memory. 1972 if (IsFast) 1973 *IsFast = 1; 1974 1975 return Size >= 32 && Alignment >= Align(4); 1976 } 1977 1978 bool SITargetLowering::allowsMisalignedMemoryAccesses( 1979 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 1980 unsigned *IsFast) const { 1981 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, 1982 Alignment, Flags, IsFast); 1983 } 1984 1985 EVT SITargetLowering::getOptimalMemOpType( 1986 LLVMContext &Context, const MemOp &Op, 1987 const AttributeList &FuncAttributes) const { 1988 // FIXME: Should account for address space here. 1989 1990 // The default fallback uses the private pointer size as a guess for a type to 1991 // use. 
Make sure we switch these to 64-bit accesses. 1992 1993 if (Op.size() >= 16 && 1994 Op.isDstAligned(Align(4))) // XXX: Should only do for global 1995 return MVT::v4i32; 1996 1997 if (Op.size() >= 8 && Op.isDstAligned(Align(4))) 1998 return MVT::v2i32; 1999 2000 // Use the default. 2001 return MVT::Other; 2002 } 2003 2004 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { 2005 const MemSDNode *MemNode = cast<MemSDNode>(N); 2006 return MemNode->getMemOperand()->getFlags() & MONoClobber; 2007 } 2008 2009 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { 2010 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || 2011 AS == AMDGPUAS::PRIVATE_ADDRESS; 2012 } 2013 2014 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, 2015 unsigned DestAS) const { 2016 // Flat -> private/local is a simple truncate. 2017 // Flat -> global is no-op 2018 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) 2019 return true; 2020 2021 const GCNTargetMachine &TM = 2022 static_cast<const GCNTargetMachine &>(getTargetMachine()); 2023 return TM.isNoopAddrSpaceCast(SrcAS, DestAS); 2024 } 2025 2026 TargetLoweringBase::LegalizeTypeAction 2027 SITargetLowering::getPreferredVectorAction(MVT VT) const { 2028 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 2029 VT.getScalarType().bitsLE(MVT::i16)) 2030 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; 2031 return TargetLoweringBase::getPreferredVectorAction(VT); 2032 } 2033 2034 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 2035 Type *Ty) const { 2036 // FIXME: Could be smarter if called for vector constants. 2037 return true; 2038 } 2039 2040 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 2041 unsigned Index) const { 2042 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 2043 return false; 2044 2045 // TODO: Add more cases that are cheap. 2046 return Index == 0; 2047 } 2048 2049 bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const { 2050 // TODO: This should be more aggressive, particular for 16-bit element 2051 // vectors. However there are some mixed improvements and regressions. 2052 EVT EltTy = VT.getVectorElementType(); 2053 return EltTy.getSizeInBits() % 32 == 0; 2054 } 2055 2056 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { 2057 if (Subtarget->has16BitInsts() && VT == MVT::i16) { 2058 switch (Op) { 2059 case ISD::LOAD: 2060 case ISD::STORE: 2061 return true; 2062 default: 2063 return false; 2064 } 2065 } 2066 2067 // SimplifySetCC uses this function to determine whether or not it should 2068 // create setcc with i1 operands. We don't have instructions for i1 setcc. 2069 if (VT == MVT::i1 && Op == ISD::SETCC) 2070 return false; 2071 2072 return TargetLowering::isTypeDesirableForOp(Op, VT); 2073 } 2074 2075 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, 2076 const SDLoc &SL, 2077 SDValue Chain, 2078 uint64_t Offset) const { 2079 const DataLayout &DL = DAG.getDataLayout(); 2080 MachineFunction &MF = DAG.getMachineFunction(); 2081 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2082 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 2083 2084 auto [InputPtrReg, RC, ArgTy] = 2085 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2086 2087 // We may not have the kernarg segment argument if we have no kernel 2088 // arguments. 
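// In that case there is no base pointer to offset from, so just return the
// byte offset itself as a constant.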
2089 if (!InputPtrReg) 2090 return DAG.getConstant(Offset, SL, PtrVT); 2091 2092 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 2093 SDValue BasePtr = DAG.getCopyFromReg( 2094 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); 2095 2096 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset)); 2097 } 2098 2099 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, 2100 const SDLoc &SL) const { 2101 uint64_t Offset = 2102 getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT); 2103 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); 2104 } 2105 2106 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG, 2107 const SDLoc &SL) const { 2108 2109 Function &F = DAG.getMachineFunction().getFunction(); 2110 std::optional<uint32_t> KnownSize = 2111 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 2112 if (KnownSize.has_value()) 2113 return DAG.getConstant(*KnownSize, SL, MVT::i32); 2114 return SDValue(); 2115 } 2116 2117 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, 2118 const SDLoc &SL, SDValue Val, 2119 bool Signed, 2120 const ISD::InputArg *Arg) const { 2121 // First, if it is a widened vector, narrow it. 2122 if (VT.isVector() && 2123 VT.getVectorNumElements() != MemVT.getVectorNumElements()) { 2124 EVT NarrowedVT = 2125 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 2126 VT.getVectorNumElements()); 2127 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val, 2128 DAG.getConstant(0, SL, MVT::i32)); 2129 } 2130 2131 // Then convert the vector elements or scalar value. 2132 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) { 2133 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; 2134 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); 2135 } 2136 2137 if (MemVT.isFloatingPoint()) 2138 Val = getFPExtOrFPRound(DAG, Val, SL, VT); 2139 else if (Signed) 2140 Val = DAG.getSExtOrTrunc(Val, SL, VT); 2141 else 2142 Val = DAG.getZExtOrTrunc(Val, SL, VT); 2143 2144 return Val; 2145 } 2146 2147 SDValue SITargetLowering::lowerKernargMemParameter( 2148 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, 2149 uint64_t Offset, Align Alignment, bool Signed, 2150 const ISD::InputArg *Arg) const { 2151 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2152 2153 // Try to avoid using an extload by loading earlier than the argument address, 2154 // and extracting the relevant bits. The load should hopefully be merged with 2155 // the previous argument. 2156 if (MemVT.getStoreSize() < 4 && Alignment < 4) { 2157 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). 2158 int64_t AlignDownOffset = alignDown(Offset, 4); 2159 int64_t OffsetDiff = Offset - AlignDownOffset; 2160 2161 EVT IntVT = MemVT.changeTypeToInteger(); 2162 2163 // TODO: If we passed in the base kernel offset we could have a better 2164 // alignment than 4, but we don't really need it. 
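// For example, an i16 argument at byte offset 6 becomes an aligned i32 load
// at offset 4, shifted right by 16 bits and truncated, rather than a 2-byte
// extending load.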
2165 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); 2166 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), 2167 MachineMemOperand::MODereferenceable | 2168 MachineMemOperand::MOInvariant); 2169 2170 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); 2171 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); 2172 2173 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); 2174 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); 2175 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); 2176 2177 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL); 2178 } 2179 2180 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); 2181 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, 2182 MachineMemOperand::MODereferenceable | 2183 MachineMemOperand::MOInvariant); 2184 2185 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); 2186 return DAG.getMergeValues({Val, Load.getValue(1)}, SL); 2187 } 2188 2189 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, 2190 CCValAssign &VA, const SDLoc &SL, 2191 SDValue Chain, 2192 const ISD::InputArg &Arg) const { 2193 MachineFunction &MF = DAG.getMachineFunction(); 2194 MachineFrameInfo &MFI = MF.getFrameInfo(); 2195 2196 if (Arg.Flags.isByVal()) { 2197 unsigned Size = Arg.Flags.getByValSize(); 2198 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); 2199 return DAG.getFrameIndex(FrameIdx, MVT::i32); 2200 } 2201 2202 unsigned ArgOffset = VA.getLocMemOffset(); 2203 unsigned ArgSize = VA.getValVT().getStoreSize(); 2204 2205 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); 2206 2207 // Create load nodes to retrieve arguments from the stack. 2208 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); 2209 SDValue ArgValue; 2210 2211 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2212 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2213 MVT MemVT = VA.getValVT(); 2214 2215 switch (VA.getLocInfo()) { 2216 default: 2217 break; 2218 case CCValAssign::BCvt: 2219 MemVT = VA.getLocVT(); 2220 break; 2221 case CCValAssign::SExt: 2222 ExtType = ISD::SEXTLOAD; 2223 break; 2224 case CCValAssign::ZExt: 2225 ExtType = ISD::ZEXTLOAD; 2226 break; 2227 case CCValAssign::AExt: 2228 ExtType = ISD::EXTLOAD; 2229 break; 2230 } 2231 2232 ArgValue = DAG.getExtLoad( 2233 ExtType, SL, VA.getLocVT(), Chain, FIN, 2234 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); 2235 return ArgValue; 2236 } 2237 2238 SDValue SITargetLowering::getPreloadedValue( 2239 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, 2240 AMDGPUFunctionArgInfo::PreloadedValue PVID) const { 2241 const ArgDescriptor *Reg = nullptr; 2242 const TargetRegisterClass *RC; 2243 LLT Ty; 2244 2245 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); 2246 const ArgDescriptor WorkGroupIDX = 2247 ArgDescriptor::createRegister(AMDGPU::TTMP9); 2248 // If GridZ is not programmed in an entry function then the hardware will set 2249 // it to all zeros, so there is no need to mask the GridY value in the low 2250 // order bits. 2251 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 2252 AMDGPU::TTMP7, 2253 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? 
~0u : 0xFFFFu); 2254 const ArgDescriptor WorkGroupIDZ = 2255 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 2256 if (Subtarget->hasArchitectedSGPRs() && 2257 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { 2258 switch (PVID) { 2259 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 2260 Reg = &WorkGroupIDX; 2261 RC = &AMDGPU::SReg_32RegClass; 2262 Ty = LLT::scalar(32); 2263 break; 2264 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 2265 Reg = &WorkGroupIDY; 2266 RC = &AMDGPU::SReg_32RegClass; 2267 Ty = LLT::scalar(32); 2268 break; 2269 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 2270 Reg = &WorkGroupIDZ; 2271 RC = &AMDGPU::SReg_32RegClass; 2272 Ty = LLT::scalar(32); 2273 break; 2274 default: 2275 break; 2276 } 2277 } 2278 2279 if (!Reg) 2280 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); 2281 if (!Reg) { 2282 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { 2283 // It's possible for a kernarg intrinsic call to appear in a kernel with 2284 // no allocated segment, in which case we do not add the user sgpr 2285 // argument, so just return null. 2286 return DAG.getConstant(0, SDLoc(), VT); 2287 } 2288 2289 // It's undefined behavior if a function marked with the amdgpu-no-* 2290 // attributes uses the corresponding intrinsic. 2291 return DAG.getPOISON(VT); 2292 } 2293 2294 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg); 2295 } 2296 2297 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, 2298 CallingConv::ID CallConv, 2299 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped, 2300 FunctionType *FType, 2301 SIMachineFunctionInfo *Info) { 2302 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { 2303 const ISD::InputArg *Arg = &Ins[I]; 2304 2305 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && 2306 "vector type argument should have been split"); 2307 2308 // First check if it's a PS input addr. 2309 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() && 2310 PSInputNum <= 15) { 2311 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); 2312 2313 // Inconveniently only the first part of the split is marked as isSplit, 2314 // so skip to the end. We only want to increment PSInputNum once for the 2315 // entire split argument. 2316 if (Arg->Flags.isSplit()) { 2317 while (!Arg->Flags.isSplitEnd()) { 2318 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && 2319 "unexpected vector split in ps argument type"); 2320 if (!SkipArg) 2321 Splits.push_back(*Arg); 2322 Arg = &Ins[++I]; 2323 } 2324 } 2325 2326 if (SkipArg) { 2327 // We can safely skip PS inputs. 2328 Skipped.set(Arg->getOrigArgIndex()); 2329 ++PSInputNum; 2330 continue; 2331 } 2332 2333 Info->markPSInputAllocated(PSInputNum); 2334 if (Arg->Used) 2335 Info->markPSInputEnabled(PSInputNum); 2336 2337 ++PSInputNum; 2338 } 2339 2340 Splits.push_back(*Arg); 2341 } 2342 } 2343 2344 // Allocate special inputs passed in VGPRs. 2345 void SITargetLowering::allocateSpecialEntryInputVGPRs( 2346 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, 2347 SIMachineFunctionInfo &Info) const { 2348 const LLT S32 = LLT::scalar(32); 2349 MachineRegisterInfo &MRI = MF.getRegInfo(); 2350 2351 if (Info.hasWorkItemIDX()) { 2352 Register Reg = AMDGPU::VGPR0; 2353 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); 2354 2355 CCInfo.AllocateReg(Reg); 2356 unsigned Mask = 2357 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 
0x3ff : ~0u;
2358 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2359 }
2360 
2361 if (Info.hasWorkItemIDY()) {
2362 assert(Info.hasWorkItemIDX());
2363 if (Subtarget->hasPackedTID()) {
2364 Info.setWorkItemIDY(
2365 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2366 } else {
2367 unsigned Reg = AMDGPU::VGPR1;
2368 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2369 
2370 CCInfo.AllocateReg(Reg);
2371 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2372 }
2373 }
2374 
2375 if (Info.hasWorkItemIDZ()) {
2376 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2377 if (Subtarget->hasPackedTID()) {
2378 Info.setWorkItemIDZ(
2379 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2380 } else {
2381 unsigned Reg = AMDGPU::VGPR2;
2382 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2383 
2384 CCInfo.AllocateReg(Reg);
2385 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2386 }
2387 }
2388 }
2389 
2390 // Try to allocate a VGPR at the end of the argument list, or if no argument
2391 // VGPRs are left, allocate a stack slot.
2392 // If \p Mask is given, it indicates the bitfield position in the register.
2393 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2394 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2395 ArgDescriptor Arg = ArgDescriptor()) {
2396 if (Arg.isSet())
2397 return ArgDescriptor::createArg(Arg, Mask);
2398 
2399 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2400 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2401 if (RegIdx == ArgVGPRs.size()) {
2402 // Spill to stack required.
2403 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2404 
2405 return ArgDescriptor::createStack(Offset, Mask);
2406 }
2407 
2408 unsigned Reg = ArgVGPRs[RegIdx];
2409 Reg = CCInfo.AllocateReg(Reg);
2410 assert(Reg != AMDGPU::NoRegister);
2411 
2412 MachineFunction &MF = CCInfo.getMachineFunction();
2413 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2414 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2415 return ArgDescriptor::createRegister(Reg, Mask);
2416 }
2417 
2418 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2419 const TargetRegisterClass *RC,
2420 unsigned NumArgRegs) {
2421 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2422 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2423 if (RegIdx == ArgSGPRs.size())
2424 report_fatal_error("ran out of SGPRs for arguments");
2425 
2426 unsigned Reg = ArgSGPRs[RegIdx];
2427 Reg = CCInfo.AllocateReg(Reg);
2428 assert(Reg != AMDGPU::NoRegister);
2429 
2430 MachineFunction &MF = CCInfo.getMachineFunction();
2431 MF.addLiveIn(Reg, RC);
2432 return ArgDescriptor::createRegister(Reg);
2433 }
2434 
2435 // If this has a fixed position, we still should allocate the register in the
2436 // CCInfo state. Technically we could get away with this for values passed
2437 // outside of the normal argument range.
2438 static void allocateFixedSGPRInputImpl(CCState &CCInfo, 2439 const TargetRegisterClass *RC, 2440 MCRegister Reg) { 2441 Reg = CCInfo.AllocateReg(Reg); 2442 assert(Reg != AMDGPU::NoRegister); 2443 MachineFunction &MF = CCInfo.getMachineFunction(); 2444 MF.addLiveIn(Reg, RC); 2445 } 2446 2447 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { 2448 if (Arg) { 2449 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 2450 Arg.getRegister()); 2451 } else 2452 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); 2453 } 2454 2455 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { 2456 if (Arg) { 2457 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 2458 Arg.getRegister()); 2459 } else 2460 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); 2461 } 2462 2463 /// Allocate implicit function VGPR arguments at the end of allocated user 2464 /// arguments. 2465 void SITargetLowering::allocateSpecialInputVGPRs( 2466 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, 2467 SIMachineFunctionInfo &Info) const { 2468 const unsigned Mask = 0x3ff; 2469 ArgDescriptor Arg; 2470 2471 if (Info.hasWorkItemIDX()) { 2472 Arg = allocateVGPR32Input(CCInfo, Mask); 2473 Info.setWorkItemIDX(Arg); 2474 } 2475 2476 if (Info.hasWorkItemIDY()) { 2477 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg); 2478 Info.setWorkItemIDY(Arg); 2479 } 2480 2481 if (Info.hasWorkItemIDZ()) 2482 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); 2483 } 2484 2485 /// Allocate implicit function VGPR arguments in fixed registers. 2486 void SITargetLowering::allocateSpecialInputVGPRsFixed( 2487 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, 2488 SIMachineFunctionInfo &Info) const { 2489 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); 2490 if (!Reg) 2491 report_fatal_error("failed to allocate VGPR for implicit arguments"); 2492 2493 const unsigned Mask = 0x3ff; 2494 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); 2495 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10)); 2496 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20)); 2497 } 2498 2499 void SITargetLowering::allocateSpecialInputSGPRs( 2500 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, 2501 SIMachineFunctionInfo &Info) const { 2502 auto &ArgInfo = Info.getArgInfo(); 2503 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); 2504 2505 // TODO: Unify handling with private memory pointers. 2506 if (UserSGPRInfo.hasDispatchPtr()) 2507 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); 2508 2509 if (UserSGPRInfo.hasQueuePtr()) 2510 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); 2511 2512 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a 2513 // constant offset from the kernarg segment. 2514 if (Info.hasImplicitArgPtr()) 2515 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); 2516 2517 if (UserSGPRInfo.hasDispatchID()) 2518 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); 2519 2520 // flat_scratch_init is not applicable for non-kernel functions. 
2521 
2522 if (Info.hasWorkGroupIDX())
2523 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2524 
2525 if (Info.hasWorkGroupIDY())
2526 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2527 
2528 if (Info.hasWorkGroupIDZ())
2529 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2530 
2531 if (Info.hasLDSKernelId())
2532 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2533 }
2534 
2535 // Allocate special inputs passed in user SGPRs.
2536 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2537 MachineFunction &MF,
2538 const SIRegisterInfo &TRI,
2539 SIMachineFunctionInfo &Info) const {
2540 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2541 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2542 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2543 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2544 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2545 }
2546 
2547 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2548 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2549 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2550 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2551 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2552 }
2553 
2554 if (UserSGPRInfo.hasDispatchPtr()) {
2555 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2556 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2557 CCInfo.AllocateReg(DispatchPtrReg);
2558 }
2559 
2560 if (UserSGPRInfo.hasQueuePtr()) {
2561 Register QueuePtrReg = Info.addQueuePtr(TRI);
2562 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2563 CCInfo.AllocateReg(QueuePtrReg);
2564 }
2565 
2566 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2567 MachineRegisterInfo &MRI = MF.getRegInfo();
2568 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2569 CCInfo.AllocateReg(InputPtrReg);
2570 
2571 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2572 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2573 }
2574 
2575 if (UserSGPRInfo.hasDispatchID()) {
2576 Register DispatchIDReg = Info.addDispatchID(TRI);
2577 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2578 CCInfo.AllocateReg(DispatchIDReg);
2579 }
2580 
2581 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2582 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2583 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2584 CCInfo.AllocateReg(FlatScratchInitReg);
2585 }
2586 
2587 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2588 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2589 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2590 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2591 }
2592 
2593 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2594 // these from the dispatch pointer.
2595 }
2596 
2597 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2598 // sequential starting from the first argument.
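// Preloading stops at the first argument that is not marked 'inreg', does not
// continue the original argument sequence, or does not fit in the remaining
// free user SGPRs.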
2599 void SITargetLowering::allocatePreloadKernArgSGPRs( 2600 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, 2601 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, 2602 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { 2603 Function &F = MF.getFunction(); 2604 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); 2605 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); 2606 bool InPreloadSequence = true; 2607 unsigned InIdx = 0; 2608 bool AlignedForImplictArgs = false; 2609 unsigned ImplicitArgOffset = 0; 2610 for (auto &Arg : F.args()) { 2611 if (!InPreloadSequence || !Arg.hasInRegAttr()) 2612 break; 2613 2614 unsigned ArgIdx = Arg.getArgNo(); 2615 // Don't preload non-original args or parts not in the current preload 2616 // sequence. 2617 if (InIdx < Ins.size() && 2618 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx)) 2619 break; 2620 2621 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && 2622 Ins[InIdx].getOrigArgIndex() == ArgIdx; 2623 InIdx++) { 2624 assert(ArgLocs[ArgIdx].isMemLoc()); 2625 auto &ArgLoc = ArgLocs[InIdx]; 2626 const Align KernelArgBaseAlign = Align(16); 2627 unsigned ArgOffset = ArgLoc.getLocMemOffset(); 2628 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset); 2629 unsigned NumAllocSGPRs = 2630 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; 2631 2632 // Fix alignment for hidden arguments. 2633 if (Arg.hasAttribute("amdgpu-hidden-argument")) { 2634 if (!AlignedForImplictArgs) { 2635 ImplicitArgOffset = 2636 alignTo(LastExplicitArgOffset, 2637 Subtarget->getAlignmentForImplicitArgPtr()) - 2638 LastExplicitArgOffset; 2639 AlignedForImplictArgs = true; 2640 } 2641 ArgOffset += ImplicitArgOffset; 2642 } 2643 2644 // Arg is preloaded into the previous SGPR. 2645 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { 2646 assert(InIdx >= 1 && "No previous SGPR"); 2647 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( 2648 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); 2649 continue; 2650 } 2651 2652 unsigned Padding = ArgOffset - LastExplicitArgOffset; 2653 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; 2654 // Check for free user SGPRs for preloading. 2655 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) { 2656 InPreloadSequence = false; 2657 break; 2658 } 2659 2660 // Preload this argument. 2661 const TargetRegisterClass *RC = 2662 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); 2663 SmallVectorImpl<MCRegister> *PreloadRegs = 2664 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); 2665 2666 if (PreloadRegs->size() > 1) 2667 RC = &AMDGPU::SGPR_32RegClass; 2668 for (auto &Reg : *PreloadRegs) { 2669 assert(Reg); 2670 MF.addLiveIn(Reg, RC); 2671 CCInfo.AllocateReg(Reg); 2672 } 2673 2674 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; 2675 } 2676 } 2677 } 2678 2679 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, 2680 const SIRegisterInfo &TRI, 2681 SIMachineFunctionInfo &Info) const { 2682 // Always allocate this last since it is a synthetic preload. 2683 if (Info.hasLDSKernelId()) { 2684 Register Reg = Info.addLDSKernelId(); 2685 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2686 CCInfo.AllocateReg(Reg); 2687 } 2688 } 2689 2690 // Allocate special input registers that are initialized per-wave. 
2691 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
2692 SIMachineFunctionInfo &Info,
2693 CallingConv::ID CallConv,
2694 bool IsShader) const {
2695 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2696 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2697 // Note: user SGPRs are handled by the front-end for graphics shaders.
2698 // Pad up the used user SGPRs with dead inputs.
2699 
2700 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2701 // before enabling architected SGPRs for workgroup IDs.
2702 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2703 
2704 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2705 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2706 // rely on it to reach 16 since if we end up having no stack usage, it will
2707 // not really be added.
2708 unsigned NumRequiredSystemSGPRs =
2709 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2710 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2711 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2712 Register Reg = Info.addReservedUserSGPR();
2713 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2714 CCInfo.AllocateReg(Reg);
2715 }
2716 }
2717 
2718 if (!HasArchitectedSGPRs) {
2719 if (Info.hasWorkGroupIDX()) {
2720 Register Reg = Info.addWorkGroupIDX();
2721 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2722 CCInfo.AllocateReg(Reg);
2723 }
2724 
2725 if (Info.hasWorkGroupIDY()) {
2726 Register Reg = Info.addWorkGroupIDY();
2727 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2728 CCInfo.AllocateReg(Reg);
2729 }
2730 
2731 if (Info.hasWorkGroupIDZ()) {
2732 Register Reg = Info.addWorkGroupIDZ();
2733 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2734 CCInfo.AllocateReg(Reg);
2735 }
2736 }
2737 
2738 if (Info.hasWorkGroupInfo()) {
2739 Register Reg = Info.addWorkGroupInfo();
2740 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2741 CCInfo.AllocateReg(Reg);
2742 }
2743 
2744 if (Info.hasPrivateSegmentWaveByteOffset()) {
2745 // Scratch wave offset passed in system SGPR.
2746 unsigned PrivateSegmentWaveByteOffsetReg;
2747 
2748 if (IsShader) {
2749 PrivateSegmentWaveByteOffsetReg =
2750 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2751 
2752 // This is true if the scratch wave byte offset doesn't have a fixed
2753 // location.
2754 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2755 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2756 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2757 }
2758 } else
2759 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2760 
2761 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2762 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2763 }
2764 
2765 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2766 Info.getNumPreloadedSGPRs() >= 16);
2767 }
2768 
2769 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2770 MachineFunction &MF,
2771 const SIRegisterInfo &TRI,
2772 SIMachineFunctionInfo &Info) {
2773 // Now that we've figured out where the scratch register inputs are, see if
2774 // we should reserve the arguments and use them directly.
2775 MachineFrameInfo &MFI = MF.getFrameInfo();
2776 bool HasStackObjects = MFI.hasStackObjects();
2777 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2778 
2779 // Record that we know we have non-spill stack objects so we don't need to
2780 // check all stack objects later.
2781 if (HasStackObjects) 2782 Info.setHasNonSpillStackObjects(true); 2783 2784 // Everything live out of a block is spilled with fast regalloc, so it's 2785 // almost certain that spilling will be required. 2786 if (TM.getOptLevel() == CodeGenOptLevel::None) 2787 HasStackObjects = true; 2788 2789 // For now assume stack access is needed in any callee functions, so we need 2790 // the scratch registers to pass in. 2791 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); 2792 2793 if (!ST.enableFlatScratch()) { 2794 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { 2795 // If we have stack objects, we unquestionably need the private buffer 2796 // resource. For the Code Object V2 ABI, this will be the first 4 user 2797 // SGPR inputs. We can reserve those and use them directly. 2798 2799 Register PrivateSegmentBufferReg = 2800 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 2801 Info.setScratchRSrcReg(PrivateSegmentBufferReg); 2802 } else { 2803 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); 2804 // We tentatively reserve the last registers (skipping the last registers 2805 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation, 2806 // we'll replace these with the ones immediately after those which were 2807 // really allocated. In the prologue copies will be inserted from the 2808 // argument to these reserved registers. 2809 2810 // Without HSA, relocations are used for the scratch pointer and the 2811 // buffer resource setup is always inserted in the prologue. Scratch wave 2812 // offset is still in an input SGPR. 2813 Info.setScratchRSrcReg(ReservedBufferReg); 2814 } 2815 } 2816 2817 MachineRegisterInfo &MRI = MF.getRegInfo(); 2818 2819 // For entry functions we have to set up the stack pointer if we use it, 2820 // whereas non-entry functions get this "for free". This means there is no 2821 // intrinsic advantage to using S32 over S34 in cases where we do not have 2822 // calls but do need a frame pointer (i.e. if we are requested to have one 2823 // because frame pointer elimination is disabled). To keep things simple we 2824 // only ever use S32 as the call ABI stack pointer, and so using it does not 2825 // imply we need a separate frame pointer. 2826 // 2827 // Try to use s32 as the SP, but move it if it would interfere with input 2828 // arguments. This won't work with calls though. 2829 // 2830 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input 2831 // registers. 2832 if (!MRI.isLiveIn(AMDGPU::SGPR32)) { 2833 Info.setStackPtrOffsetReg(AMDGPU::SGPR32); 2834 } else { 2835 assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); 2836 2837 if (MFI.hasCalls()) 2838 report_fatal_error("call in graphics shader with too many input SGPRs"); 2839 2840 for (unsigned Reg : AMDGPU::SGPR_32RegClass) { 2841 if (!MRI.isLiveIn(Reg)) { 2842 Info.setStackPtrOffsetReg(Reg); 2843 break; 2844 } 2845 } 2846 2847 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) 2848 report_fatal_error("failed to find register for SP"); 2849 } 2850 2851 // hasFP should be accurate for entry functions even before the frame is 2852 // finalized, because it does not rely on the known stack size, only 2853 // properties like whether variable sized objects are present. 
2854 if (ST.getFrameLowering()->hasFP(MF)) { 2855 Info.setFrameOffsetReg(AMDGPU::SGPR33); 2856 } 2857 } 2858 2859 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { 2860 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2861 return !Info->isEntryFunction(); 2862 } 2863 2864 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {} 2865 2866 void SITargetLowering::insertCopiesSplitCSR( 2867 MachineBasicBlock *Entry, 2868 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 2869 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2870 2871 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 2872 if (!IStart) 2873 return; 2874 2875 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2876 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 2877 MachineBasicBlock::iterator MBBI = Entry->begin(); 2878 for (const MCPhysReg *I = IStart; *I; ++I) { 2879 const TargetRegisterClass *RC = nullptr; 2880 if (AMDGPU::SReg_64RegClass.contains(*I)) 2881 RC = &AMDGPU::SGPR_64RegClass; 2882 else if (AMDGPU::SReg_32RegClass.contains(*I)) 2883 RC = &AMDGPU::SGPR_32RegClass; 2884 else 2885 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2886 2887 Register NewVR = MRI->createVirtualRegister(RC); 2888 // Create copy from CSR to a virtual register. 2889 Entry->addLiveIn(*I); 2890 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 2891 .addReg(*I); 2892 2893 // Insert the copy-back instructions right before the terminator. 2894 for (auto *Exit : Exits) 2895 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 2896 TII->get(TargetOpcode::COPY), *I) 2897 .addReg(NewVR); 2898 } 2899 } 2900 2901 SDValue SITargetLowering::LowerFormalArguments( 2902 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2903 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 2904 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2905 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2906 2907 MachineFunction &MF = DAG.getMachineFunction(); 2908 const Function &Fn = MF.getFunction(); 2909 FunctionType *FType = MF.getFunction().getFunctionType(); 2910 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2911 bool IsError = false; 2912 2913 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) { 2914 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 2915 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc())); 2916 IsError = true; 2917 } 2918 2919 SmallVector<ISD::InputArg, 16> Splits; 2920 SmallVector<CCValAssign, 16> ArgLocs; 2921 BitVector Skipped(Ins.size()); 2922 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2923 *DAG.getContext()); 2924 2925 bool IsGraphics = AMDGPU::isGraphics(CallConv); 2926 bool IsKernel = AMDGPU::isKernel(CallConv); 2927 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); 2928 2929 if (IsGraphics) { 2930 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); 2931 assert(!UserSGPRInfo.hasDispatchPtr() && 2932 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && 2933 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && 2934 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); 2935 (void)UserSGPRInfo; 2936 if (!Subtarget->enableFlatScratch()) 2937 assert(!UserSGPRInfo.hasFlatScratchInit()); 2938 if ((CallConv != CallingConv::AMDGPU_CS && 2939 CallConv != CallingConv::AMDGPU_Gfx) || 2940 !Subtarget->hasArchitectedSGPRs()) 2941 
assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && 2942 !Info->hasWorkGroupIDZ()); 2943 } 2944 2945 if (CallConv == CallingConv::AMDGPU_PS) { 2946 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); 2947 2948 // At least one interpolation mode must be enabled or else the GPU will 2949 // hang. 2950 // 2951 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user 2952 // set PSInputAddr, the user wants to enable some bits after the compilation 2953 // based on run-time states. Since we can't know what the final PSInputEna 2954 // will look like, so we shouldn't do anything here and the user should take 2955 // responsibility for the correct programming. 2956 // 2957 // Otherwise, the following restrictions apply: 2958 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. 2959 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be 2960 // enabled too. 2961 if ((Info->getPSInputAddr() & 0x7F) == 0 || 2962 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) { 2963 CCInfo.AllocateReg(AMDGPU::VGPR0); 2964 CCInfo.AllocateReg(AMDGPU::VGPR1); 2965 Info->markPSInputAllocated(0); 2966 Info->markPSInputEnabled(0); 2967 } 2968 if (Subtarget->isAmdPalOS()) { 2969 // For isAmdPalOS, the user does not enable some bits after compilation 2970 // based on run-time states; the register values being generated here are 2971 // the final ones set in hardware. Therefore we need to apply the 2972 // workaround to PSInputAddr and PSInputEnable together. (The case where 2973 // a bit is set in PSInputAddr but not PSInputEnable is where the 2974 // frontend set up an input arg for a particular interpolation mode, but 2975 // nothing uses that input arg. Really we should have an earlier pass 2976 // that removes such an arg.) 2977 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); 2978 if ((PsInputBits & 0x7F) == 0 || 2979 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) 2980 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr())); 2981 } 2982 } else if (IsKernel) { 2983 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); 2984 } else { 2985 Splits.append(Ins.begin(), Ins.end()); 2986 } 2987 2988 if (IsKernel) 2989 analyzeFormalArgumentsCompute(CCInfo, Ins); 2990 2991 if (IsEntryFunc) { 2992 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); 2993 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); 2994 if (IsKernel && Subtarget->hasKernargPreload()) 2995 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info); 2996 2997 allocateLDSKernelId(CCInfo, MF, *TRI, *Info); 2998 } else if (!IsGraphics) { 2999 // For the fixed ABI, pass workitem IDs in the last argument register. 3000 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); 3001 3002 // FIXME: Sink this into allocateSpecialInputSGPRs 3003 if (!Subtarget->enableFlatScratch()) 3004 CCInfo.AllocateReg(Info->getScratchRSrcReg()); 3005 3006 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); 3007 } 3008 3009 if (!IsKernel) { 3010 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); 3011 CCInfo.AnalyzeFormalArguments(Splits, AssignFn); 3012 } 3013 3014 SmallVector<SDValue, 16> Chains; 3015 3016 // FIXME: This is the minimum kernel argument alignment. We should improve 3017 // this to the maximum alignment of the arguments. 3018 // 3019 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit 3020 // kern arg offset. 
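// The kernarg segment base is assumed to be at least 16-byte aligned; each
// argument's effective alignment below is the common alignment of that base
// and its byte offset.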
  const Align KernelArgBaseAlign = Align(16);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
      InVals.push_back(DAG.getPOISON(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (IsEntryFunc && VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);

      if (Arg.Flags.isByRef()) {
        SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);

        const GCNTargetMachine &TM =
            static_cast<const GCNTargetMachine &>(getTargetMachine());
        if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
                                    Arg.Flags.getPointerAddrSpace())) {
          Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
                                     Arg.Flags.getPointerAddrSpace());
        }

        InVals.push_back(Ptr);
        continue;
      }

      SDValue NewArg;
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
        if (MemVT.getStoreSize() < 4 && Alignment < 4) {
          // In this case the argument is packed into the previous preload SGPR.
          int64_t AlignDownOffset = alignDown(Offset, 4);
          int64_t OffsetDiff = Offset - AlignDownOffset;
          EVT IntVT = MemVT.changeTypeToInteger();

          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          Register Reg =
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          assert(Reg);
          Register VReg = MRI.getLiveInVirtReg(Reg);
          SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);

          SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
          SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);

          SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

          NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
        } else {
          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          const SmallVectorImpl<MCRegister> &PreloadRegs =
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          SDValue Copy;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
            const TargetRegisterClass *RC = MRI.getRegClass(VReg);
            NewArg = DAG.getCopyFromReg(
                Chain, DL, VReg,
                EVT::getIntegerVT(*DAG.getContext(),
                                  TRI->getRegSizeInBits(*RC)));

          } else {
            // If the kernarg alignment does not match the alignment of the
            // SGPR tuple RC that can accommodate this argument, it will be
            // built up via copies from the individual SGPRs that the argument
            // was preloaded to.
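            // For instance, an 8-byte argument that was preloaded into two
            // consecutive SGPRs is rebuilt below as a two-element i32
            // build_vector and then bitcast back to its memory type.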
3103 SmallVector<SDValue, 4> Elts; 3104 for (auto Reg : PreloadRegs) { 3105 Register VReg = MRI.getLiveInVirtReg(Reg); 3106 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); 3107 Elts.push_back(Copy); 3108 } 3109 NewArg = 3110 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, 3111 PreloadRegs.size()), 3112 DL, Elts); 3113 } 3114 3115 // If the argument was preloaded to multiple consecutive 32-bit 3116 // registers because of misalignment between addressable SGPR tuples 3117 // and the argument size, we can still assume that because of kernarg 3118 // segment alignment restrictions that NewArg's size is the same as 3119 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a 3120 // truncate since we cannot preload to less than a single SGPR and the 3121 // MemVT may be smaller. 3122 EVT MemVTInt = 3123 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); 3124 if (MemVT.bitsLT(NewArg.getSimpleValueType())) 3125 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg); 3126 3127 NewArg = DAG.getBitcast(MemVT, NewArg); 3128 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg, 3129 Ins[i].Flags.isSExt(), &Ins[i]); 3130 NewArg = DAG.getMergeValues({NewArg, Chain}, DL); 3131 } 3132 } else { 3133 // Hidden arguments that are in the kernel signature must be preloaded 3134 // to user SGPRs. Print a diagnostic error if a hidden argument is in 3135 // the argument list and is not preloaded. 3136 if (Arg.isOrigArg()) { 3137 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex()); 3138 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) { 3139 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 3140 *OrigArg->getParent(), 3141 "hidden argument in kernel signature was not preloaded", 3142 DL.getDebugLoc())); 3143 } 3144 } 3145 3146 NewArg = 3147 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, 3148 Alignment, Ins[i].Flags.isSExt(), &Ins[i]); 3149 } 3150 Chains.push_back(NewArg.getValue(1)); 3151 3152 auto *ParamTy = 3153 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 3154 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 3155 ParamTy && 3156 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 3157 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) { 3158 // On SI local pointers are just offsets into LDS, so they are always 3159 // less than 16-bits. On CI and newer they could potentially be 3160 // real pointers, so we can't guarantee their size. 3161 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, 3162 DAG.getValueType(MVT::i16)); 3163 } 3164 3165 InVals.push_back(NewArg); 3166 continue; 3167 } 3168 if (!IsEntryFunc && VA.isMemLoc()) { 3169 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); 3170 InVals.push_back(Val); 3171 if (!Arg.Flags.isByVal()) 3172 Chains.push_back(Val.getValue(1)); 3173 continue; 3174 } 3175 3176 assert(VA.isRegLoc() && "Parameter must be in a register!"); 3177 3178 Register Reg = VA.getLocReg(); 3179 const TargetRegisterClass *RC = nullptr; 3180 if (AMDGPU::VGPR_32RegClass.contains(Reg)) 3181 RC = &AMDGPU::VGPR_32RegClass; 3182 else if (AMDGPU::SGPR_32RegClass.contains(Reg)) 3183 RC = &AMDGPU::SGPR_32RegClass; 3184 else 3185 llvm_unreachable("Unexpected register class in LowerFormalArguments!"); 3186 EVT ValVT = VA.getValVT(); 3187 3188 Reg = MF.addLiveIn(Reg, RC); 3189 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 3190 3191 if (Arg.Flags.isSRet()) { 3192 // The return object should be reasonably addressable. 

      // FIXME: This helps when the return is a real sret. If it is an
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits =
          32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
      Val = DAG.getNode(
          ISD::AssertZext, DL, VT, Val,
          DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    }

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Val);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc)
    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);

  // DAG.getPass() returns nullptr when using the new pass manager.
  // TODO: Use DAG.getMFAM() to access analysis result.
  if (DAG.getPass()) {
    auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
  }

  unsigned StackArgSize = CCInfo.getStackSize();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                        : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
    const Type *RetTy) const {
  // Replacing returns with sret/stack usage doesn't make sense for shaders.
  // FIXME: Also sort of a workaround for custom vector splitting in
  // LowerReturn for shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
    return false;

  // We must use the stack if return would require unavailable registers.
3267 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF); 3268 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 3269 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) 3270 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i))) 3271 return false; 3272 3273 return true; 3274 } 3275 3276 SDValue 3277 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3278 bool isVarArg, 3279 const SmallVectorImpl<ISD::OutputArg> &Outs, 3280 const SmallVectorImpl<SDValue> &OutVals, 3281 const SDLoc &DL, SelectionDAG &DAG) const { 3282 MachineFunction &MF = DAG.getMachineFunction(); 3283 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3284 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 3285 3286 if (AMDGPU::isKernel(CallConv)) { 3287 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, 3288 OutVals, DL, DAG); 3289 } 3290 3291 bool IsShader = AMDGPU::isShader(CallConv); 3292 3293 Info->setIfReturnsVoid(Outs.empty()); 3294 bool IsWaveEnd = Info->returnsVoid() && IsShader; 3295 3296 // CCValAssign - represent the assignment of the return value to a location. 3297 SmallVector<CCValAssign, 48> RVLocs; 3298 3299 // CCState - Info about the registers and stack slots. 3300 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3301 *DAG.getContext()); 3302 3303 // Analyze outgoing return values. 3304 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3305 3306 SDValue Glue; 3307 SmallVector<SDValue, 48> RetOps; 3308 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 3309 3310 SDValue ReadFirstLane = 3311 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32); 3312 // Copy the result values into the output registers. 3313 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E; 3314 ++I, ++RealRVLocIdx) { 3315 CCValAssign &VA = RVLocs[I]; 3316 assert(VA.isRegLoc() && "Can only return in registers!"); 3317 // TODO: Partially return in registers if return values don't fit. 3318 SDValue Arg = OutVals[RealRVLocIdx]; 3319 3320 // Copied from other backends. 3321 switch (VA.getLocInfo()) { 3322 case CCValAssign::Full: 3323 break; 3324 case CCValAssign::BCvt: 3325 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3326 break; 3327 case CCValAssign::SExt: 3328 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3329 break; 3330 case CCValAssign::ZExt: 3331 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3332 break; 3333 case CCValAssign::AExt: 3334 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3335 break; 3336 default: 3337 llvm_unreachable("Unknown loc info!"); 3338 } 3339 if (TRI->isSGPRPhysReg(VA.getLocReg())) 3340 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(), 3341 ReadFirstLane, Arg); 3342 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue); 3343 Glue = Chain.getValue(1); 3344 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3345 } 3346 3347 // FIXME: Does sret work properly? 
3348 if (!Info->isEntryFunction()) { 3349 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3350 const MCPhysReg *I = 3351 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3352 if (I) { 3353 for (; *I; ++I) { 3354 if (AMDGPU::SReg_64RegClass.contains(*I)) 3355 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 3356 else if (AMDGPU::SReg_32RegClass.contains(*I)) 3357 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 3358 else 3359 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3360 } 3361 } 3362 } 3363 3364 // Update chain and glue. 3365 RetOps[0] = Chain; 3366 if (Glue.getNode()) 3367 RetOps.push_back(Glue); 3368 3369 unsigned Opc = AMDGPUISD::ENDPGM; 3370 if (!IsWaveEnd) 3371 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE; 3372 return DAG.getNode(Opc, DL, MVT::Other, RetOps); 3373 } 3374 3375 SDValue SITargetLowering::LowerCallResult( 3376 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg, 3377 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 3378 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, 3379 SDValue ThisVal) const { 3380 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg); 3381 3382 // Assign locations to each value returned by this call. 3383 SmallVector<CCValAssign, 16> RVLocs; 3384 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, 3385 *DAG.getContext()); 3386 CCInfo.AnalyzeCallResult(Ins, RetCC); 3387 3388 // Copy all of the result registers out of their specified physreg. 3389 for (CCValAssign VA : RVLocs) { 3390 SDValue Val; 3391 3392 if (VA.isRegLoc()) { 3393 Val = 3394 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue); 3395 Chain = Val.getValue(1); 3396 InGlue = Val.getValue(2); 3397 } else if (VA.isMemLoc()) { 3398 report_fatal_error("TODO: return values in memory"); 3399 } else 3400 llvm_unreachable("unknown argument location type"); 3401 3402 switch (VA.getLocInfo()) { 3403 case CCValAssign::Full: 3404 break; 3405 case CCValAssign::BCvt: 3406 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 3407 break; 3408 case CCValAssign::ZExt: 3409 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val, 3410 DAG.getValueType(VA.getValVT())); 3411 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3412 break; 3413 case CCValAssign::SExt: 3414 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val, 3415 DAG.getValueType(VA.getValVT())); 3416 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3417 break; 3418 case CCValAssign::AExt: 3419 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3420 break; 3421 default: 3422 llvm_unreachable("Unknown loc info!"); 3423 } 3424 3425 InVals.push_back(Val); 3426 } 3427 3428 return Chain; 3429 } 3430 3431 // Add code to pass special inputs required depending on used features separate 3432 // from the explicit user arguments present in the IR. 3433 void SITargetLowering::passSpecialInputs( 3434 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, 3435 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, 3436 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const { 3437 // If we don't have a call site, this was a call inserted by 3438 // legalization. These can never use special inputs. 
3439 if (!CLI.CB) 3440 return; 3441 3442 SelectionDAG &DAG = CLI.DAG; 3443 const SDLoc &DL = CLI.DL; 3444 const Function &F = DAG.getMachineFunction().getFunction(); 3445 3446 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3447 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); 3448 3449 const AMDGPUFunctionArgInfo *CalleeArgInfo = 3450 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; 3451 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { 3452 // DAG.getPass() returns nullptr when using new pass manager. 3453 // TODO: Use DAG.getMFAM() to access analysis result. 3454 if (DAG.getPass()) { 3455 auto &ArgUsageInfo = 3456 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); 3457 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); 3458 } 3459 } 3460 3461 // TODO: Unify with private memory register handling. This is complicated by 3462 // the fact that at least in kernels, the input argument is not necessarily 3463 // in the same location as the input. 3464 // clang-format off 3465 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue, 3466 StringLiteral> ImplicitAttrs[] = { 3467 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}, 3468 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" }, 3469 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"}, 3470 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"}, 3471 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, 3472 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"}, 3473 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}, 3474 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"}, 3475 }; 3476 // clang-format on 3477 3478 for (auto [InputID, Attr] : ImplicitAttrs) { 3479 // If the callee does not use the attribute value, skip copying the value. 3480 if (CLI.CB->hasFnAttr(Attr)) 3481 continue; 3482 3483 const auto [OutgoingArg, ArgRC, ArgTy] = 3484 CalleeArgInfo->getPreloadedValue(InputID); 3485 if (!OutgoingArg) 3486 continue; 3487 3488 const auto [IncomingArg, IncomingArgRC, Ty] = 3489 CallerArgInfo.getPreloadedValue(InputID); 3490 assert(IncomingArgRC == ArgRC); 3491 3492 // All special arguments are ints for now. 3493 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; 3494 SDValue InputReg; 3495 3496 if (IncomingArg) { 3497 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg); 3498 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { 3499 // The implicit arg ptr is special because it doesn't have a corresponding 3500 // input for kernels, and is computed from the kernarg segment pointer. 3501 InputReg = getImplicitArgPtr(DAG, DL); 3502 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) { 3503 std::optional<uint32_t> Id = 3504 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 3505 if (Id.has_value()) { 3506 InputReg = DAG.getConstant(*Id, DL, ArgVT); 3507 } else { 3508 InputReg = DAG.getPOISON(ArgVT); 3509 } 3510 } else { 3511 // We may have proven the input wasn't needed, although the ABI is 3512 // requiring it. We just need to allocate the register appropriately. 
3513 InputReg = DAG.getPOISON(ArgVT); 3514 } 3515 3516 if (OutgoingArg->isRegister()) { 3517 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); 3518 if (!CCInfo.AllocateReg(OutgoingArg->getRegister())) 3519 report_fatal_error("failed to allocate implicit input argument"); 3520 } else { 3521 unsigned SpecialArgOffset = 3522 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4)); 3523 SDValue ArgStore = 3524 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); 3525 MemOpChains.push_back(ArgStore); 3526 } 3527 } 3528 3529 // Pack workitem IDs into a single register or pass it as is if already 3530 // packed. 3531 3532 auto [OutgoingArg, ArgRC, Ty] = 3533 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3534 if (!OutgoingArg) 3535 std::tie(OutgoingArg, ArgRC, Ty) = 3536 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3537 if (!OutgoingArg) 3538 std::tie(OutgoingArg, ArgRC, Ty) = 3539 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3540 if (!OutgoingArg) 3541 return; 3542 3543 const ArgDescriptor *IncomingArgX = std::get<0>( 3544 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X)); 3545 const ArgDescriptor *IncomingArgY = std::get<0>( 3546 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y)); 3547 const ArgDescriptor *IncomingArgZ = std::get<0>( 3548 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z)); 3549 3550 SDValue InputReg; 3551 SDLoc SL; 3552 3553 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x"); 3554 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y"); 3555 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z"); 3556 3557 // If incoming ids are not packed we need to pack them. 3558 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && 3559 NeedWorkItemIDX) { 3560 if (Subtarget->getMaxWorkitemID(F, 0) != 0) { 3561 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); 3562 } else { 3563 InputReg = DAG.getConstant(0, DL, MVT::i32); 3564 } 3565 } 3566 3567 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && 3568 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) { 3569 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); 3570 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, 3571 DAG.getShiftAmountConstant(10, MVT::i32, SL)); 3572 InputReg = InputReg.getNode() 3573 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) 3574 : Y; 3575 } 3576 3577 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && 3578 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) { 3579 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); 3580 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, 3581 DAG.getShiftAmountConstant(20, MVT::i32, SL)); 3582 InputReg = InputReg.getNode() 3583 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) 3584 : Z; 3585 } 3586 3587 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { 3588 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { 3589 // We're in a situation where the outgoing function requires the workitem 3590 // ID, but the calling function does not have it (e.g a graphics function 3591 // calling a C calling convention function). This is illegal, but we need 3592 // to produce something. 
3593 InputReg = DAG.getPOISON(MVT::i32); 3594 } else { 3595 // Workitem ids are already packed, any of present incoming arguments 3596 // will carry all required fields. 3597 ArgDescriptor IncomingArg = 3598 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX 3599 : IncomingArgY ? *IncomingArgY 3600 : *IncomingArgZ, 3601 ~0u); 3602 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); 3603 } 3604 } 3605 3606 if (OutgoingArg->isRegister()) { 3607 if (InputReg) 3608 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); 3609 3610 CCInfo.AllocateReg(OutgoingArg->getRegister()); 3611 } else { 3612 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4)); 3613 if (InputReg) { 3614 SDValue ArgStore = 3615 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); 3616 MemOpChains.push_back(ArgStore); 3617 } 3618 } 3619 } 3620 3621 bool SITargetLowering::isEligibleForTailCallOptimization( 3622 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, 3623 const SmallVectorImpl<ISD::OutputArg> &Outs, 3624 const SmallVectorImpl<SDValue> &OutVals, 3625 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 3626 if (AMDGPU::isChainCC(CalleeCC)) 3627 return true; 3628 3629 if (!AMDGPU::mayTailCallThisCC(CalleeCC)) 3630 return false; 3631 3632 // For a divergent call target, we need to do a waterfall loop over the 3633 // possible callees which precludes us from using a simple jump. 3634 if (Callee->isDivergent()) 3635 return false; 3636 3637 MachineFunction &MF = DAG.getMachineFunction(); 3638 const Function &CallerF = MF.getFunction(); 3639 CallingConv::ID CallerCC = CallerF.getCallingConv(); 3640 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 3641 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 3642 3643 // Kernels aren't callable, and don't have a live in return address so it 3644 // doesn't make sense to do a tail call with entry functions. 3645 if (!CallerPreserved) 3646 return false; 3647 3648 bool CCMatch = CallerCC == CalleeCC; 3649 3650 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 3651 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch) 3652 return true; 3653 return false; 3654 } 3655 3656 // TODO: Can we handle var args? 3657 if (IsVarArg) 3658 return false; 3659 3660 for (const Argument &Arg : CallerF.args()) { 3661 if (Arg.hasByValAttr()) 3662 return false; 3663 } 3664 3665 LLVMContext &Ctx = *DAG.getContext(); 3666 3667 // Check that the call results are passed in the same way. 3668 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins, 3669 CCAssignFnForCall(CalleeCC, IsVarArg), 3670 CCAssignFnForCall(CallerCC, IsVarArg))) 3671 return false; 3672 3673 // The callee has to preserve all registers the caller needs to preserve. 3674 if (!CCMatch) { 3675 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 3676 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 3677 return false; 3678 } 3679 3680 // Nothing more to check if the callee is taking no arguments. 3681 if (Outs.empty()) 3682 return true; 3683 3684 SmallVector<CCValAssign, 16> ArgLocs; 3685 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); 3686 3687 // FIXME: We are not allocating special input registers, so we will be 3688 // deciding based on incorrect register assignments. 
3689 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg)); 3690 3691 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 3692 // If the stack arguments for this call do not fit into our own save area then 3693 // the call cannot be made tail. 3694 // TODO: Is this really necessary? 3695 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) 3696 return false; 3697 3698 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) { 3699 // FIXME: What about inreg arguments that end up passed in memory? 3700 if (!CCVA.isRegLoc()) 3701 continue; 3702 3703 // If we are passing an argument in an SGPR, and the value is divergent, 3704 // this call requires a waterfall loop. 3705 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) { 3706 LLVM_DEBUG( 3707 dbgs() << "Cannot tail call due to divergent outgoing argument in " 3708 << printReg(CCVA.getLocReg(), TRI) << '\n'); 3709 return false; 3710 } 3711 } 3712 3713 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3714 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals); 3715 } 3716 3717 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 3718 if (!CI->isTailCall()) 3719 return false; 3720 3721 const Function *ParentFn = CI->getParent()->getParent(); 3722 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) 3723 return false; 3724 return true; 3725 } 3726 3727 namespace { 3728 // Chain calls have special arguments that we need to handle. These are 3729 // tagging along at the end of the arguments list(s), after the SGPR and VGPR 3730 // arguments (index 0 and 1 respectively). 3731 enum ChainCallArgIdx { 3732 Exec = 2, 3733 Flags, 3734 NumVGPRs, 3735 FallbackExec, 3736 FallbackCallee 3737 }; 3738 } // anonymous namespace 3739 3740 // The wave scratch offset register is used as the global base pointer. 3741 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, 3742 SmallVectorImpl<SDValue> &InVals) const { 3743 CallingConv::ID CallConv = CLI.CallConv; 3744 bool IsChainCallConv = AMDGPU::isChainCC(CallConv); 3745 3746 SelectionDAG &DAG = CLI.DAG; 3747 3748 const SDLoc &DL = CLI.DL; 3749 SDValue Chain = CLI.Chain; 3750 SDValue Callee = CLI.Callee; 3751 3752 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs; 3753 bool UsesDynamicVGPRs = false; 3754 if (IsChainCallConv) { 3755 // The last arguments should be the value that we need to put in EXEC, 3756 // followed by the flags and any other arguments with special meanings. 3757 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so 3758 // we don't treat them like the "real" arguments. 3759 auto RequestedExecIt = 3760 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) { 3761 return Arg.OrigArgIndex == 2; 3762 }); 3763 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC"); 3764 3765 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin(); 3766 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx, 3767 CLI.OutVals.end()); 3768 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end()); 3769 3770 assert(CLI.Outs.back().OrigArgIndex < 2 && 3771 "Haven't popped all the special args"); 3772 3773 TargetLowering::ArgListEntry RequestedExecArg = 3774 CLI.Args[ChainCallArgIdx::Exec]; 3775 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize())) 3776 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC"); 3777 3778 // Convert constants into TargetConstants, so they become immediate operands 3779 // instead of being selected into S_MOV. 
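    // For reference (restating ChainCallArgIdx above): CLI.Args holds the SGPR
    // and VGPR argument lists at indices 0 and 1, the EXEC value at index 2,
    // the flags at index 3, and, when bit 0 of the flags is set, NumVGPRs,
    // FallbackExec and FallbackCallee at indices 4 to 6.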
3780 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) { 3781 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) { 3782 ChainCallSpecialArgs.push_back(DAG.getTargetConstant( 3783 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0))); 3784 } else 3785 ChainCallSpecialArgs.push_back(Arg.Node); 3786 }; 3787 3788 PushNodeOrTargetConstant(RequestedExecArg); 3789 3790 // Process any other special arguments depending on the value of the flags. 3791 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags]; 3792 3793 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue(); 3794 if (FlagsValue.isZero()) { 3795 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1) 3796 return lowerUnhandledCall(CLI, InVals, 3797 "no additional args allowed if flags == 0"); 3798 } else if (FlagsValue.isOneBitSet(0)) { 3799 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) { 3800 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args"); 3801 } 3802 3803 if (!Subtarget->isWave32()) { 3804 return lowerUnhandledCall( 3805 CLI, InVals, "dynamic VGPR mode is only supported for wave32"); 3806 } 3807 3808 UsesDynamicVGPRs = true; 3809 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs, 3810 CLI.Args.end(), PushNodeOrTargetConstant); 3811 } 3812 } 3813 3814 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 3815 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 3816 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 3817 bool &IsTailCall = CLI.IsTailCall; 3818 bool IsVarArg = CLI.IsVarArg; 3819 bool IsSibCall = false; 3820 MachineFunction &MF = DAG.getMachineFunction(); 3821 3822 if (Callee.isUndef() || isNullConstant(Callee)) { 3823 if (!CLI.IsTailCall) { 3824 for (ISD::InputArg &Arg : CLI.Ins) 3825 InVals.push_back(DAG.getPOISON(Arg.VT)); 3826 } 3827 3828 return Chain; 3829 } 3830 3831 if (IsVarArg) { 3832 return lowerUnhandledCall(CLI, InVals, 3833 "unsupported call to variadic function "); 3834 } 3835 3836 if (!CLI.CB) 3837 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization"); 3838 3839 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { 3840 return lowerUnhandledCall(CLI, InVals, 3841 "unsupported required tail call to function "); 3842 } 3843 3844 if (IsTailCall) { 3845 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg, 3846 Outs, OutVals, Ins, DAG); 3847 if (!IsTailCall && 3848 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) { 3849 report_fatal_error("failed to perform tail call elimination on a call " 3850 "site marked musttail or on llvm.amdgcn.cs.chain"); 3851 } 3852 3853 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 3854 3855 // A sibling call is one where we're under the usual C ABI and not planning 3856 // to change that but can still do a tail call: 3857 if (!TailCallOpt && IsTailCall) 3858 IsSibCall = true; 3859 3860 if (IsTailCall) 3861 ++NumTailCalls; 3862 } 3863 3864 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3865 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 3866 SmallVector<SDValue, 8> MemOpChains; 3867 3868 // Analyze operands of the call, assigning locations to each operand. 
3869 SmallVector<CCValAssign, 16> ArgLocs; 3870 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 3871 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); 3872 3873 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { 3874 // With a fixed ABI, allocate fixed registers before user arguments. 3875 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); 3876 } 3877 3878 CCInfo.AnalyzeCallOperands(Outs, AssignFn); 3879 3880 // Get a count of how many bytes are to be pushed on the stack. 3881 unsigned NumBytes = CCInfo.getStackSize(); 3882 3883 if (IsSibCall) { 3884 // Since we're not changing the ABI to make this a tail call, the memory 3885 // operands are already available in the caller's incoming argument space. 3886 NumBytes = 0; 3887 } 3888 3889 // FPDiff is the byte offset of the call's argument area from the callee's. 3890 // Stores to callee stack arguments will be placed in FixedStackSlots offset 3891 // by this amount for a tail call. In a sibling call it must be 0 because the 3892 // caller will deallocate the entire stack and the callee still expects its 3893 // arguments to begin at SP+0. Completely unused for non-tail calls. 3894 int32_t FPDiff = 0; 3895 MachineFrameInfo &MFI = MF.getFrameInfo(); 3896 auto *TRI = Subtarget->getRegisterInfo(); 3897 3898 // Adjust the stack pointer for the new arguments... 3899 // These operations are automatically eliminated by the prolog/epilog pass 3900 if (!IsSibCall) 3901 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); 3902 3903 if (!IsSibCall || IsChainCallConv) { 3904 if (!Subtarget->enableFlatScratch()) { 3905 SmallVector<SDValue, 4> CopyFromChains; 3906 3907 // In the HSA case, this should be an identity copy. 3908 SDValue ScratchRSrcReg = 3909 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); 3910 RegsToPass.emplace_back(IsChainCallConv 3911 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 3912 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, 3913 ScratchRSrcReg); 3914 CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); 3915 Chain = DAG.getTokenFactor(DL, CopyFromChains); 3916 } 3917 } 3918 3919 const unsigned NumSpecialInputs = RegsToPass.size(); 3920 3921 MVT PtrVT = MVT::i32; 3922 3923 // Walk the register/memloc assignments, inserting copies/loads. 3924 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3925 CCValAssign &VA = ArgLocs[i]; 3926 SDValue Arg = OutVals[i]; 3927 3928 // Promote the value if needed. 
3929 switch (VA.getLocInfo()) { 3930 case CCValAssign::Full: 3931 break; 3932 case CCValAssign::BCvt: 3933 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3934 break; 3935 case CCValAssign::ZExt: 3936 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3937 break; 3938 case CCValAssign::SExt: 3939 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3940 break; 3941 case CCValAssign::AExt: 3942 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3943 break; 3944 case CCValAssign::FPExt: 3945 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3946 break; 3947 default: 3948 llvm_unreachable("Unknown loc info!"); 3949 } 3950 3951 if (VA.isRegLoc()) { 3952 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg)); 3953 } else { 3954 assert(VA.isMemLoc()); 3955 3956 SDValue DstAddr; 3957 MachinePointerInfo DstInfo; 3958 3959 unsigned LocMemOffset = VA.getLocMemOffset(); 3960 int32_t Offset = LocMemOffset; 3961 3962 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); 3963 MaybeAlign Alignment; 3964 3965 if (IsTailCall) { 3966 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3967 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() 3968 : VA.getValVT().getStoreSize(); 3969 3970 // FIXME: We can have better than the minimum byval required alignment. 3971 Alignment = 3972 Flags.isByVal() 3973 ? Flags.getNonZeroByValAlign() 3974 : commonAlignment(Subtarget->getStackAlignment(), Offset); 3975 3976 Offset = Offset + FPDiff; 3977 int FI = MFI.CreateFixedObject(OpSize, Offset, true); 3978 3979 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3980 DstInfo = MachinePointerInfo::getFixedStack(MF, FI); 3981 3982 // Make sure any stack arguments overlapping with where we're storing 3983 // are loaded before this eventual operation. Otherwise they'll be 3984 // clobbered. 3985 3986 // FIXME: Why is this really necessary? This seems to just result in a 3987 // lot of code to copy the stack and write them back to the same 3988 // locations, which are supposed to be immutable? 3989 Chain = addTokenForArgument(Chain, DAG, MFI, FI); 3990 } else { 3991 // Stores to the argument stack area are relative to the stack pointer. 
        SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
                                        MVT::i32);
        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
        Alignment =
            commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
        SDValue Cpy =
            DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
                          Outs[i].Flags.getNonZeroByValAlign(),
                          /*isVol = */ false, /*AlwaysInline = */ true,
                          /*CI=*/nullptr, std::nullopt, DstInfo,
                          MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));

        MemOpChains.push_back(Cpy);
      } else {
        SDValue Store =
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
        MemOpChains.push_back(Store);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  SDValue ReadFirstLaneID =
      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);

  SDValue TokenGlue;
  if (CLI.ConvergenceControlToken) {
    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
                            CLI.ConvergenceControlToken);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InGlue;

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
      // For chain calls, the inreg arguments are required to be
      // uniform. Speculatively insert a readfirstlane in case we cannot prove
      // they are uniform.
      //
      // For other calls, if an inreg argument is known to be uniform,
      // speculatively insert a readfirstlane in case it is in a VGPR.
      //
      // FIXME: We would need to execute this in a waterfall loop if it is a
      // divergent value; for now that case continues to produce invalid code.

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(TokenGlue);
      Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
                        ReadfirstlaneArgs);
    }

    Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
    InGlue = Chain.getValue(1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
    InGlue = Chain.getValue(1);
  }

  std::vector<SDValue> Ops({Chain});

  // Add a redundant copy of the callee global which will not be legalized, as
  // we need direct access to the callee later.
  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = GSD->getGlobal();
    Ops.push_back(Callee);
    Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
  } else {
    if (IsTailCall) {
      // isEligibleForTailCallOptimization considered whether the call target is
      // divergent, but we may still end up with a uniform value in a VGPR.
      // Insert a readfirstlane just in case.
      SDValue ReadFirstLaneID =
          DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
      Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
                           ReadfirstlaneArgs);
    }

    Ops.push_back(Callee);
    Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
  }

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
  }

  if (IsChainCallConv)
    llvm::append_range(Ops, ChainCallSpecialArgs);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &[Reg, Val] : RegsToPass)
    Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (SDValue Token = CLI.ConvergenceControlToken) {
    SmallVector<SDValue, 2> GlueOps;
    GlueOps.push_back(Token);
    if (InGlue)
      GlueOps.push_back(InGlue);

    InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
                                        MVT::Glue, GlueOps),
                     0);
  }

  if (InGlue)
    Ops.push_back(InGlue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    unsigned OPC = AMDGPUISD::TC_RETURN;
    switch (CallConv) {
    case CallingConv::AMDGPU_Gfx:
      OPC = AMDGPUISD::TC_RETURN_GFX;
      break;
    case CallingConv::AMDGPU_CS_Chain:
    case CallingConv::AMDGPU_CS_ChainPreserve:
      OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                             : AMDGPUISD::TC_RETURN_CHAIN;
      break;
    }

    return DAG.getNode(OPC, DL, MVT::Other, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
  if (!Ins.empty())
    InGlue = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, /*IsThisReturn=*/false, SDValue());
}

// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for:
// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
// 2.
Scale size where, scale = wave-reduction(alloca-size) * wave-size 4169 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 4170 SelectionDAG &DAG) const { 4171 const MachineFunction &MF = DAG.getMachineFunction(); 4172 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4173 4174 SDLoc dl(Op); 4175 EVT VT = Op.getValueType(); 4176 SDValue Chain = Op.getOperand(0); 4177 Register SPReg = Info->getStackPtrOffsetReg(); 4178 4179 // Chain the dynamic stack allocation so that it doesn't modify the stack 4180 // pointer when other instructions are using the stack. 4181 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 4182 4183 SDValue Size = Op.getOperand(1); 4184 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT); 4185 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue(); 4186 4187 const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); 4188 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp && 4189 "Stack grows upwards for AMDGPU"); 4190 4191 Chain = BaseAddr.getValue(1); 4192 Align StackAlign = TFL->getStackAlign(); 4193 if (Alignment > StackAlign) { 4194 uint64_t ScaledAlignment = (uint64_t)Alignment.value() 4195 << Subtarget->getWavefrontSizeLog2(); 4196 uint64_t StackAlignMask = ScaledAlignment - 1; 4197 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, 4198 DAG.getConstant(StackAlignMask, dl, VT)); 4199 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr, 4200 DAG.getSignedConstant(-ScaledAlignment, dl, VT)); 4201 } 4202 4203 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit"); 4204 SDValue NewSP; 4205 if (isa<ConstantSDNode>(Size)) { 4206 // For constant sized alloca, scale alloca size by wave-size 4207 SDValue ScaledSize = DAG.getNode( 4208 ISD::SHL, dl, VT, Size, 4209 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); 4210 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value 4211 } else { 4212 // For dynamic sized alloca, perform wave-wide reduction to get max of 4213 // alloca size(divergent) and then scale it by wave-size 4214 SDValue WaveReduction = 4215 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32); 4216 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction, 4217 Size, DAG.getConstant(0, dl, MVT::i32)); 4218 SDValue ScaledSize = DAG.getNode( 4219 ISD::SHL, dl, VT, Size, 4220 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); 4221 NewSP = 4222 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr. 4223 SDValue ReadFirstLaneID = 4224 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32); 4225 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID, 4226 NewSP); 4227 } 4228 4229 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain 4230 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); 4231 4232 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl); 4233 } 4234 4235 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { 4236 if (Op.getValueType() != MVT::i32) 4237 return Op; // Defer to cannot select error. 4238 4239 Register SP = getStackPointerRegisterToSaveRestore(); 4240 SDLoc SL(Op); 4241 4242 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); 4243 4244 // Convert from wave uniform to swizzled vector address. This should protect 4245 // from any edge cases where the stacksave result isn't directly used with 4246 // stackrestore. 
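  // (Illustrative summary of the surrounding scheme, not of this node's exact
  // expansion: LowerDYNAMIC_STACKALLOC above scales allocation sizes by the
  // wave size before bumping SP, so the value copied from SP here is a
  // wave-level offset; WAVE_ADDRESS produces the per-lane, "swizzled" view of
  // that address that IR-level users of stacksave expect.)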
4247 SDValue VectorAddress = 4248 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP); 4249 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL); 4250 } 4251 4252 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, 4253 SelectionDAG &DAG) const { 4254 SDLoc SL(Op); 4255 assert(Op.getValueType() == MVT::i32); 4256 4257 uint32_t BothRoundHwReg = 4258 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); 4259 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); 4260 4261 SDValue IntrinID = 4262 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); 4263 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), 4264 Op.getOperand(0), IntrinID, GetRoundBothImm); 4265 4266 // There are two rounding modes, one for f32 and one for f64/f16. We only 4267 // report in the standard value range if both are the same. 4268 // 4269 // The raw values also differ from the expected FLT_ROUNDS values. Nearest 4270 // ties away from zero is not supported, and the other values are rotated by 4271 // 1. 4272 // 4273 // If the two rounding modes are not the same, report a target defined value. 4274 4275 // Mode register rounding mode fields: 4276 // 4277 // [1:0] Single-precision round mode. 4278 // [3:2] Double/Half-precision round mode. 4279 // 4280 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero. 4281 // 4282 // Hardware Spec 4283 // Toward-0 3 0 4284 // Nearest Even 0 1 4285 // +Inf 1 2 4286 // -Inf 2 3 4287 // NearestAway0 N/A 4 4288 // 4289 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit 4290 // table we can index by the raw hardware mode. 4291 // 4292 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf 4293 4294 SDValue BitTable = 4295 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64); 4296 4297 SDValue Two = DAG.getConstant(2, SL, MVT::i32); 4298 SDValue RoundModeTimesNumBits = 4299 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two); 4300 4301 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we 4302 // knew only one mode was demanded. 4303 SDValue TableValue = 4304 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); 4305 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); 4306 4307 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32); 4308 SDValue TableEntry = 4309 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask); 4310 4311 // There's a gap in the 4-bit encoded table and actual enum values, so offset 4312 // if it's an extended value. 4313 SDValue Four = DAG.getConstant(4, SL, MVT::i32); 4314 SDValue IsStandardValue = 4315 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT); 4316 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four); 4317 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, 4318 TableEntry, EnumOffset); 4319 4320 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); 4321 } 4322 4323 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, 4324 SelectionDAG &DAG) const { 4325 SDLoc SL(Op); 4326 4327 SDValue NewMode = Op.getOperand(1); 4328 assert(NewMode.getValueType() == MVT::i32); 4329 4330 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the 4331 // hardware MODE.fp_round values. 
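  // Worked example, following the field encoding documented in
  // lowerGET_ROUNDING above: FLT_ROUNDS == 1 (to nearest) uses "nearest even"
  // in both hardware fields, giving the nibble 0b0000, while FLT_ROUNDS == 2
  // (toward +infinity) uses 1 in both 2-bit fields, giving 0b0101.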
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
    uint32_t ClampedVal = std::min(
        static_cast<uint32_t>(ConstMode->getZExtValue()),
        static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
    NewMode = DAG.getConstant(
        AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
  } else {
    // If we know the input can only be one of the supported standard modes in
    // the range 0-3, we can use a simplified mapping to hardware values.
    KnownBits KB = DAG.computeKnownBits(NewMode);
    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
    // The supported standard values are 0-3. The extended values start at 8.
    // We need to offset by 4 if the value is in the extended range.

    if (UseReducedTable) {
      // Only the entries for the standard values 0-3 are needed; they occupy
      // the low 16 bits of the table, so keep just those in a 32-bit constant.
      SDValue BitTable = DAG.getConstant(
          AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);

      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);

      NewMode =
          DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);

      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
      // the table extracted bits into inline immediates.
    } else {
      // table_index = umin(value, value - 4)
      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
      SDValue BitTable =
          DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);

      SDValue Four = DAG.getConstant(4, SL, MVT::i32);
      SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
      SDValue IndexVal =
          DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);

      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);

      SDValue TableValue =
          DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
      SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);

      // No need to mask out the high bits since the setreg will ignore them
      // anyway.
      NewMode = TruncTable;
    }

    // Insert a readfirstlane in case the value is a VGPR. We could do this
    // earlier and keep more operations scalar, but that interferes with
    // combining the source.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
    NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
                          ReadFirstLaneID, NewMode);
  }

  // N.B. The setreg will be later folded into s_round_mode on supported
  // targets.
4395 SDValue IntrinID = 4396 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32); 4397 uint32_t BothRoundHwReg = 4398 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); 4399 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); 4400 4401 SDValue SetReg = 4402 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0), 4403 IntrinID, RoundBothImm, NewMode); 4404 4405 return SetReg; 4406 } 4407 4408 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { 4409 if (Op->isDivergent()) 4410 return SDValue(); 4411 4412 switch (cast<MemSDNode>(Op)->getAddressSpace()) { 4413 case AMDGPUAS::FLAT_ADDRESS: 4414 case AMDGPUAS::GLOBAL_ADDRESS: 4415 case AMDGPUAS::CONSTANT_ADDRESS: 4416 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 4417 break; 4418 default: 4419 return SDValue(); 4420 } 4421 4422 return Op; 4423 } 4424 4425 // Work around DAG legality rules only based on the result type. 4426 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 4427 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; 4428 SDValue Src = Op.getOperand(IsStrict ? 1 : 0); 4429 EVT SrcVT = Src.getValueType(); 4430 4431 if (SrcVT.getScalarType() != MVT::bf16) 4432 return Op; 4433 4434 SDLoc SL(Op); 4435 SDValue BitCast = 4436 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src); 4437 4438 EVT DstVT = Op.getValueType(); 4439 if (IsStrict) 4440 llvm_unreachable("Need STRICT_BF16_TO_FP"); 4441 4442 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast); 4443 } 4444 4445 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const { 4446 SDLoc SL(Op); 4447 if (Op.getValueType() != MVT::i64) 4448 return Op; 4449 4450 uint32_t ModeHwReg = 4451 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 4452 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32); 4453 uint32_t TrapHwReg = 4454 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 4455 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32); 4456 4457 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other); 4458 SDValue IntrinID = 4459 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); 4460 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList, 4461 Op.getOperand(0), IntrinID, ModeHwRegImm); 4462 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList, 4463 Op.getOperand(0), IntrinID, TrapHwRegImm); 4464 SDValue TokenReg = 4465 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1), 4466 GetTrapReg.getValue(1)); 4467 4468 SDValue CvtPtr = 4469 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg); 4470 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); 4471 4472 return DAG.getMergeValues({Result, TokenReg}, SL); 4473 } 4474 4475 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const { 4476 SDLoc SL(Op); 4477 if (Op.getOperand(1).getValueType() != MVT::i64) 4478 return Op; 4479 4480 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1)); 4481 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input, 4482 DAG.getConstant(0, SL, MVT::i32)); 4483 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input, 4484 DAG.getConstant(1, SL, MVT::i32)); 4485 4486 SDValue ReadFirstLaneID = 4487 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32); 4488 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, 
MVT::i32, 4489 ReadFirstLaneID, NewModeReg); 4490 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, 4491 ReadFirstLaneID, NewTrapReg); 4492 4493 unsigned ModeHwReg = 4494 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 4495 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32); 4496 unsigned TrapHwReg = 4497 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 4498 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32); 4499 4500 SDValue IntrinID = 4501 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32); 4502 SDValue SetModeReg = 4503 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0), 4504 IntrinID, ModeHwRegImm, NewModeReg); 4505 SDValue SetTrapReg = 4506 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0), 4507 IntrinID, TrapHwRegImm, NewTrapReg); 4508 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg); 4509 } 4510 4511 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT, 4512 const MachineFunction &MF) const { 4513 const Function &Fn = MF.getFunction(); 4514 4515 Register Reg = StringSwitch<Register>(RegName) 4516 .Case("m0", AMDGPU::M0) 4517 .Case("exec", AMDGPU::EXEC) 4518 .Case("exec_lo", AMDGPU::EXEC_LO) 4519 .Case("exec_hi", AMDGPU::EXEC_HI) 4520 .Case("flat_scratch", AMDGPU::FLAT_SCR) 4521 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) 4522 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) 4523 .Default(Register()); 4524 if (!Reg) 4525 return Reg; 4526 4527 if (!Subtarget->hasFlatScrRegister() && 4528 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { 4529 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) + 4530 "\" for subtarget.")); 4531 } 4532 4533 switch (Reg) { 4534 case AMDGPU::M0: 4535 case AMDGPU::EXEC_LO: 4536 case AMDGPU::EXEC_HI: 4537 case AMDGPU::FLAT_SCR_LO: 4538 case AMDGPU::FLAT_SCR_HI: 4539 if (VT.getSizeInBits() == 32) 4540 return Reg; 4541 break; 4542 case AMDGPU::EXEC: 4543 case AMDGPU::FLAT_SCR: 4544 if (VT.getSizeInBits() == 64) 4545 return Reg; 4546 break; 4547 default: 4548 llvm_unreachable("missing register type checking"); 4549 } 4550 4551 report_fatal_error( 4552 Twine("invalid type for register \"" + StringRef(RegName) + "\".")); 4553 } 4554 4555 // If kill is not the last instruction, split the block so kill is always a 4556 // proper terminator. 4557 MachineBasicBlock * 4558 SITargetLowering::splitKillBlock(MachineInstr &MI, 4559 MachineBasicBlock *BB) const { 4560 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true); 4561 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4562 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); 4563 return SplitBB; 4564 } 4565 4566 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true, 4567 // \p MI will be the only instruction in the loop body block. Otherwise, it will 4568 // be the first instruction in the remainder block. 4569 // 4570 /// \returns { LoopBody, Remainder } 4571 static std::pair<MachineBasicBlock *, MachineBasicBlock *> 4572 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { 4573 MachineFunction *MF = MBB.getParent(); 4574 MachineBasicBlock::iterator I(&MI); 4575 4576 // To insert the loop we need to split the block. Move everything after this 4577 // point to a new block, and insert a new empty block between the two. 
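  // A rough picture of the control flow this sets up (RemainderBB also takes
  // over MBB's original successors and PHI references):
  //
  //   MBB -> LoopBB -> RemainderBB -> (original successors of MBB)
  //           ^   |
  //           +---+   (self edge; the caller emits the actual backedge branch)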
4578 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 4579 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 4580 MachineFunction::iterator MBBI(MBB); 4581 ++MBBI; 4582 4583 MF->insert(MBBI, LoopBB); 4584 MF->insert(MBBI, RemainderBB); 4585 4586 LoopBB->addSuccessor(LoopBB); 4587 LoopBB->addSuccessor(RemainderBB); 4588 4589 // Move the rest of the block into a new block. 4590 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 4591 4592 if (InstInLoop) { 4593 auto Next = std::next(I); 4594 4595 // Move instruction to loop body. 4596 LoopBB->splice(LoopBB->begin(), &MBB, I, Next); 4597 4598 // Move the rest of the block. 4599 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end()); 4600 } else { 4601 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 4602 } 4603 4604 MBB.addSuccessor(LoopBB); 4605 4606 return std::pair(LoopBB, RemainderBB); 4607 } 4608 4609 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. 4610 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { 4611 MachineBasicBlock *MBB = MI.getParent(); 4612 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4613 auto I = MI.getIterator(); 4614 auto E = std::next(I); 4615 4616 // clang-format off 4617 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) 4618 .addImm(0); 4619 // clang-format on 4620 4621 MIBundleBuilder Bundler(*MBB, I, E); 4622 finalizeBundle(*MBB, Bundler.begin()); 4623 } 4624 4625 MachineBasicBlock * 4626 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, 4627 MachineBasicBlock *BB) const { 4628 const DebugLoc &DL = MI.getDebugLoc(); 4629 4630 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 4631 4632 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4633 4634 // Apparently kill flags are only valid if the def is in the same block? 4635 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) 4636 Src->setIsKill(false); 4637 4638 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true); 4639 4640 MachineBasicBlock::iterator I = LoopBB->end(); 4641 4642 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( 4643 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); 4644 4645 // Clear TRAP_STS.MEM_VIOL 4646 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) 4647 .addImm(0) 4648 .addImm(EncodedReg); 4649 4650 bundleInstWithWaitcnt(MI); 4651 4652 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4653 4654 // Load and check TRAP_STS.MEM_VIOL 4655 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) 4656 .addImm(EncodedReg); 4657 4658 // FIXME: Do we need to use an isel pseudo that may clobber scc? 4659 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 4660 .addReg(Reg, RegState::Kill) 4661 .addImm(0); 4662 // clang-format off 4663 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 4664 .addMBB(LoopBB); 4665 // clang-format on 4666 4667 return RemainderBB; 4668 } 4669 4670 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the 4671 // wavefront. If the value is uniform and just happens to be in a VGPR, this 4672 // will only do one iteration. In the worst case, this will loop 64 times. 4673 // 4674 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. 
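//
// Rough shape of the loop body that gets emitted (a sketch in pseudo-MIR; the
// names are illustrative, not the registers created below):
//
//   loop:
//     CurrentIdx = V_READFIRSTLANE_B32 Idx
//     Cond       = V_CMP_EQ_U32 CurrentIdx, Idx   ; lanes holding this index
//     OldExec    = S_AND_SAVEEXEC Cond            ; exec &= Cond
//     M0 (or SGPRIdxReg) = CurrentIdx [+ Offset]
//     ... the caller inserts the indexed access for these lanes here ...
//     exec = S_XOR_term exec, OldExec             ; retire the handled lanes
//     S_CBRANCH_EXECNZ loop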
4675 static MachineBasicBlock::iterator 4676 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, 4677 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, 4678 const DebugLoc &DL, const MachineOperand &Idx, 4679 unsigned InitReg, unsigned ResultReg, unsigned PhiReg, 4680 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, 4681 Register &SGPRIdxReg) { 4682 4683 MachineFunction *MF = OrigBB.getParent(); 4684 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 4685 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4686 MachineBasicBlock::iterator I = LoopBB.begin(); 4687 4688 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); 4689 Register PhiExec = MRI.createVirtualRegister(BoolRC); 4690 Register NewExec = MRI.createVirtualRegister(BoolRC); 4691 Register CurrentIdxReg = 4692 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4693 Register CondReg = MRI.createVirtualRegister(BoolRC); 4694 4695 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) 4696 .addReg(InitReg) 4697 .addMBB(&OrigBB) 4698 .addReg(ResultReg) 4699 .addMBB(&LoopBB); 4700 4701 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) 4702 .addReg(InitSaveExecReg) 4703 .addMBB(&OrigBB) 4704 .addReg(NewExec) 4705 .addMBB(&LoopBB); 4706 4707 // Read the next variant <- also loop target. 4708 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) 4709 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef())); 4710 4711 // Compare the just read M0 value to all possible Idx values. 4712 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) 4713 .addReg(CurrentIdxReg) 4714 .addReg(Idx.getReg(), 0, Idx.getSubReg()); 4715 4716 // Update EXEC, save the original EXEC value to VCC. 4717 BuildMI(LoopBB, I, DL, 4718 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 4719 : AMDGPU::S_AND_SAVEEXEC_B64), 4720 NewExec) 4721 .addReg(CondReg, RegState::Kill); 4722 4723 MRI.setSimpleHint(NewExec, CondReg); 4724 4725 if (UseGPRIdxMode) { 4726 if (Offset == 0) { 4727 SGPRIdxReg = CurrentIdxReg; 4728 } else { 4729 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4730 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg) 4731 .addReg(CurrentIdxReg, RegState::Kill) 4732 .addImm(Offset); 4733 } 4734 } else { 4735 // Move index from VCC into M0 4736 if (Offset == 0) { 4737 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) 4738 .addReg(CurrentIdxReg, RegState::Kill); 4739 } else { 4740 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 4741 .addReg(CurrentIdxReg, RegState::Kill) 4742 .addImm(Offset); 4743 } 4744 } 4745 4746 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 4747 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4748 MachineInstr *InsertPt = 4749 BuildMI(LoopBB, I, DL, 4750 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term 4751 : AMDGPU::S_XOR_B64_term), 4752 Exec) 4753 .addReg(Exec) 4754 .addReg(NewExec); 4755 4756 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 4757 // s_cbranch_scc0? 4758 4759 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 4760 // clang-format off 4761 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 4762 .addMBB(&LoopBB); 4763 // clang-format on 4764 4765 return InsertPt->getIterator(); 4766 } 4767 4768 // This has slightly sub-optimal regalloc when the source vector is killed by 4769 // the read. 
// The register allocator does not understand that the kill is per-workitem,
// so the value is kept live for the whole loop; we therefore fail to reuse a
// subregister from it and use one more VGPR than necessary. That extra VGPR
// was avoided when this expansion was done after register allocation.
static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
               unsigned InitResultReg, unsigned PhiReg, int Offset,
               bool UseGPRIdxMode, Register &SGPRIdxReg) {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register DstReg = MI.getOperand(0).getReg();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

  // Save the EXEC mask
  // clang-format off
  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
    .addReg(Exec);
  // clang-format on

  auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(LoopBB);
  ++MBBI;
  MF->insert(MBBI, LandingPad);
  LoopBB->removeSuccessor(RemainderBB);
  LandingPad->addSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
  MachineBasicBlock::iterator First = LandingPad->begin();
  // clang-format off
  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
    .addReg(SaveExec);
  // clang-format on

  return InsPt;
}

// Returns subreg index, offset
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC, unsigned VecReg,
                            int Offset) {
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
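  // For example, with a 128-bit super-register class (NumElts == 4): Offset 2
  // maps to (sub2, 0), while Offset 7 is out of bounds and is returned as
  // (sub0, 7) so the callers fold the leftover 7 into the dynamic index.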
4832 if (Offset >= NumElts || Offset < 0) 4833 return std::pair(AMDGPU::sub0, Offset); 4834 4835 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0); 4836 } 4837 4838 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, 4839 MachineRegisterInfo &MRI, MachineInstr &MI, 4840 int Offset) { 4841 MachineBasicBlock *MBB = MI.getParent(); 4842 const DebugLoc &DL = MI.getDebugLoc(); 4843 MachineBasicBlock::iterator I(&MI); 4844 4845 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4846 4847 assert(Idx->getReg() != AMDGPU::NoRegister); 4848 4849 if (Offset == 0) { 4850 // clang-format off 4851 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) 4852 .add(*Idx); 4853 // clang-format on 4854 } else { 4855 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 4856 .add(*Idx) 4857 .addImm(Offset); 4858 } 4859 } 4860 4861 static Register getIndirectSGPRIdx(const SIInstrInfo *TII, 4862 MachineRegisterInfo &MRI, MachineInstr &MI, 4863 int Offset) { 4864 MachineBasicBlock *MBB = MI.getParent(); 4865 const DebugLoc &DL = MI.getDebugLoc(); 4866 MachineBasicBlock::iterator I(&MI); 4867 4868 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4869 4870 if (Offset == 0) 4871 return Idx->getReg(); 4872 4873 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4874 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) 4875 .add(*Idx) 4876 .addImm(Offset); 4877 return Tmp; 4878 } 4879 4880 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, 4881 MachineBasicBlock &MBB, 4882 const GCNSubtarget &ST) { 4883 const SIInstrInfo *TII = ST.getInstrInfo(); 4884 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 4885 MachineFunction *MF = MBB.getParent(); 4886 MachineRegisterInfo &MRI = MF->getRegInfo(); 4887 4888 Register Dst = MI.getOperand(0).getReg(); 4889 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4890 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); 4891 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 4892 4893 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); 4894 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 4895 4896 unsigned SubReg; 4897 std::tie(SubReg, Offset) = 4898 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); 4899 4900 const bool UseGPRIdxMode = ST.useVGPRIndexMode(); 4901 4902 // Check for a SGPR index. 4903 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { 4904 MachineBasicBlock::iterator I(&MI); 4905 const DebugLoc &DL = MI.getDebugLoc(); 4906 4907 if (UseGPRIdxMode) { 4908 // TODO: Look at the uses to avoid the copy. This may require rescheduling 4909 // to avoid interfering with other uses, so probably requires a new 4910 // optimization pass. 4911 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); 4912 4913 const MCInstrDesc &GPRIDXDesc = 4914 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); 4915 BuildMI(MBB, I, DL, GPRIDXDesc, Dst) 4916 .addReg(SrcReg) 4917 .addReg(Idx) 4918 .addImm(SubReg); 4919 } else { 4920 setM0ToIndexFromSGPR(TII, MRI, MI, Offset); 4921 4922 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 4923 .addReg(SrcReg, 0, SubReg) 4924 .addReg(SrcReg, RegState::Implicit); 4925 } 4926 4927 MI.eraseFromParent(); 4928 4929 return &MBB; 4930 } 4931 4932 // Control flow needs to be inserted if indexing with a VGPR. 
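  // A divergent index uses the same two strategies as the SGPR case above
  // (GPR-index pseudo or M0 + v_movrels), but wraps them in the waterfall loop
  // built by loadM0FromVGPR so each distinct index value in the wave gets its
  // own iteration.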
4933 const DebugLoc &DL = MI.getDebugLoc(); 4934 MachineBasicBlock::iterator I(&MI); 4935 4936 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4937 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4938 4939 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); 4940 4941 Register SGPRIdxReg; 4942 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, 4943 UseGPRIdxMode, SGPRIdxReg); 4944 4945 MachineBasicBlock *LoopBB = InsPt->getParent(); 4946 4947 if (UseGPRIdxMode) { 4948 const MCInstrDesc &GPRIDXDesc = 4949 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); 4950 4951 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) 4952 .addReg(SrcReg) 4953 .addReg(SGPRIdxReg) 4954 .addImm(SubReg); 4955 } else { 4956 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 4957 .addReg(SrcReg, 0, SubReg) 4958 .addReg(SrcReg, RegState::Implicit); 4959 } 4960 4961 MI.eraseFromParent(); 4962 4963 return LoopBB; 4964 } 4965 4966 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, 4967 MachineBasicBlock &MBB, 4968 const GCNSubtarget &ST) { 4969 const SIInstrInfo *TII = ST.getInstrInfo(); 4970 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 4971 MachineFunction *MF = MBB.getParent(); 4972 MachineRegisterInfo &MRI = MF->getRegInfo(); 4973 4974 Register Dst = MI.getOperand(0).getReg(); 4975 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); 4976 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4977 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); 4978 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 4979 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); 4980 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 4981 4982 // This can be an immediate, but will be folded later. 4983 assert(Val->getReg()); 4984 4985 unsigned SubReg; 4986 std::tie(SubReg, Offset) = 4987 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset); 4988 const bool UseGPRIdxMode = ST.useVGPRIndexMode(); 4989 4990 if (Idx->getReg() == AMDGPU::NoRegister) { 4991 MachineBasicBlock::iterator I(&MI); 4992 const DebugLoc &DL = MI.getDebugLoc(); 4993 4994 assert(Offset == 0); 4995 4996 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) 4997 .add(*SrcVec) 4998 .add(*Val) 4999 .addImm(SubReg); 5000 5001 MI.eraseFromParent(); 5002 return &MBB; 5003 } 5004 5005 // Check for a SGPR index. 5006 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { 5007 MachineBasicBlock::iterator I(&MI); 5008 const DebugLoc &DL = MI.getDebugLoc(); 5009 5010 if (UseGPRIdxMode) { 5011 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); 5012 5013 const MCInstrDesc &GPRIDXDesc = 5014 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 5015 BuildMI(MBB, I, DL, GPRIDXDesc, Dst) 5016 .addReg(SrcVec->getReg()) 5017 .add(*Val) 5018 .addReg(Idx) 5019 .addImm(SubReg); 5020 } else { 5021 setM0ToIndexFromSGPR(TII, MRI, MI, Offset); 5022 5023 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( 5024 TRI.getRegSizeInBits(*VecRC), 32, false); 5025 BuildMI(MBB, I, DL, MovRelDesc, Dst) 5026 .addReg(SrcVec->getReg()) 5027 .add(*Val) 5028 .addImm(SubReg); 5029 } 5030 MI.eraseFromParent(); 5031 return &MBB; 5032 } 5033 5034 // Control flow needs to be inserted if indexing with a VGPR. 
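  // Val may be read on every iteration of the waterfall loop built below, so a
  // kill flag inherited from the original pseudo could be stale; drop it.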
5035 if (Val->isReg()) 5036 MRI.clearKillFlags(Val->getReg()); 5037 5038 const DebugLoc &DL = MI.getDebugLoc(); 5039 5040 Register PhiReg = MRI.createVirtualRegister(VecRC); 5041 5042 Register SGPRIdxReg; 5043 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, 5044 UseGPRIdxMode, SGPRIdxReg); 5045 MachineBasicBlock *LoopBB = InsPt->getParent(); 5046 5047 if (UseGPRIdxMode) { 5048 const MCInstrDesc &GPRIDXDesc = 5049 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 5050 5051 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) 5052 .addReg(PhiReg) 5053 .add(*Val) 5054 .addReg(SGPRIdxReg) 5055 .addImm(SubReg); 5056 } else { 5057 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( 5058 TRI.getRegSizeInBits(*VecRC), 32, false); 5059 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) 5060 .addReg(PhiReg) 5061 .add(*Val) 5062 .addImm(SubReg); 5063 } 5064 5065 MI.eraseFromParent(); 5066 return LoopBB; 5067 } 5068 5069 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) { 5070 switch (Opc) { 5071 case AMDGPU::S_MIN_U32: 5072 return std::numeric_limits<uint32_t>::max(); 5073 case AMDGPU::S_MIN_I32: 5074 return std::numeric_limits<int32_t>::max(); 5075 case AMDGPU::S_MAX_U32: 5076 return std::numeric_limits<uint32_t>::min(); 5077 case AMDGPU::S_MAX_I32: 5078 return std::numeric_limits<int32_t>::min(); 5079 case AMDGPU::S_ADD_I32: 5080 case AMDGPU::S_SUB_I32: 5081 case AMDGPU::S_OR_B32: 5082 case AMDGPU::S_XOR_B32: 5083 return std::numeric_limits<uint32_t>::min(); 5084 case AMDGPU::S_AND_B32: 5085 return std::numeric_limits<uint32_t>::max(); 5086 default: 5087 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction"); 5088 } 5089 } 5090 5091 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, 5092 MachineBasicBlock &BB, 5093 const GCNSubtarget &ST, 5094 unsigned Opc) { 5095 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); 5096 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5097 const DebugLoc &DL = MI.getDebugLoc(); 5098 const SIInstrInfo *TII = ST.getInstrInfo(); 5099 5100 // Reduction operations depend on whether the input operand is SGPR or VGPR. 5101 Register SrcReg = MI.getOperand(1).getReg(); 5102 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg)); 5103 Register DstReg = MI.getOperand(0).getReg(); 5104 MachineBasicBlock *RetBB = nullptr; 5105 if (isSGPR) { 5106 switch (Opc) { 5107 case AMDGPU::S_MIN_U32: 5108 case AMDGPU::S_MIN_I32: 5109 case AMDGPU::S_MAX_U32: 5110 case AMDGPU::S_MAX_I32: 5111 case AMDGPU::S_AND_B32: 5112 case AMDGPU::S_OR_B32: { 5113 // Idempotent operations. 5114 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); 5115 RetBB = &BB; 5116 break; 5117 } 5118 case AMDGPU::S_XOR_B32: 5119 case AMDGPU::S_ADD_I32: 5120 case AMDGPU::S_SUB_I32: { 5121 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); 5122 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); 5123 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); 5124 Register ActiveLanes = MRI.createVirtualRegister(DstRegClass); 5125 5126 bool IsWave32 = ST.isWave32(); 5127 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 5128 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 5129 unsigned CountReg = 5130 IsWave32 ? 
AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64; 5131 5132 auto Exec = 5133 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg); 5134 5135 auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes) 5136 .addReg(Exec->getOperand(0).getReg()); 5137 5138 switch (Opc) { 5139 case AMDGPU::S_XOR_B32: { 5140 // Performing an XOR operation on a uniform value 5141 // depends on the parity of the number of active lanes. 5142 // For even parity, the result will be 0, for odd 5143 // parity the result will be the same as the input value. 5144 Register ParityRegister = MRI.createVirtualRegister(DstRegClass); 5145 5146 auto ParityReg = 5147 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister) 5148 .addReg(NewAccumulator->getOperand(0).getReg()) 5149 .addImm(1); 5150 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) 5151 .addReg(SrcReg) 5152 .addReg(ParityReg->getOperand(0).getReg()); 5153 break; 5154 } 5155 case AMDGPU::S_SUB_I32: { 5156 Register NegatedVal = MRI.createVirtualRegister(DstRegClass); 5157 5158 // Take the negation of the source operand. 5159 auto InvertedValReg = 5160 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal) 5161 .addImm(-1) 5162 .addReg(SrcReg); 5163 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) 5164 .addReg(InvertedValReg->getOperand(0).getReg()) 5165 .addReg(NewAccumulator->getOperand(0).getReg()); 5166 break; 5167 } 5168 case AMDGPU::S_ADD_I32: { 5169 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) 5170 .addReg(SrcReg) 5171 .addReg(NewAccumulator->getOperand(0).getReg()); 5172 break; 5173 } 5174 } 5175 RetBB = &BB; 5176 } 5177 } 5178 } else { 5179 // TODO: Implement DPP Strategy and switch based on immediate strategy 5180 // operand. For now, for all the cases (default, Iterative and DPP we use 5181 // iterative approach by default.) 5182 5183 // To reduce the VGPR using iterative approach, we need to iterate 5184 // over all the active lanes. Lowering consists of ComputeLoop, 5185 // which iterate over only active lanes. We use copy of EXEC register 5186 // as induction variable and every active lane modifies it using bitset0 5187 // so that we will get the next active lane for next iteration. 5188 MachineBasicBlock::iterator I = BB.end(); 5189 Register SrcReg = MI.getOperand(1).getReg(); 5190 5191 // Create Control flow for loop 5192 // Split MI's Machine Basic block into For loop 5193 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); 5194 5195 // Create virtual registers required for lowering. 5196 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); 5197 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); 5198 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass); 5199 Register InitalValReg = MRI.createVirtualRegister(DstRegClass); 5200 5201 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); 5202 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); 5203 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); 5204 5205 Register FF1Reg = MRI.createVirtualRegister(DstRegClass); 5206 Register LaneValueReg = 5207 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5208 5209 bool IsWave32 = ST.isWave32(); 5210 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 5211 unsigned ExecReg = IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 5212 5213 // Create initial values of induction variable from Exec, Accumulator and 5214 // insert branch instr to newly created ComputeBlock 5215 uint32_t InitalValue = getIdentityValueForWaveReduction(Opc); 5216 auto TmpSReg = 5217 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); 5218 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) 5219 .addImm(InitalValue); 5220 // clang-format off 5221 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)) 5222 .addMBB(ComputeLoop); 5223 // clang-format on 5224 5225 // Start constructing ComputeLoop 5226 I = ComputeLoop->end(); 5227 auto Accumulator = 5228 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) 5229 .addReg(InitalValReg) 5230 .addMBB(&BB); 5231 auto ActiveBits = 5232 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) 5233 .addReg(TmpSReg->getOperand(0).getReg()) 5234 .addMBB(&BB); 5235 5236 // Perform the computations 5237 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; 5238 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) 5239 .addReg(ActiveBits->getOperand(0).getReg()); 5240 auto LaneValue = BuildMI(*ComputeLoop, I, DL, 5241 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) 5242 .addReg(SrcReg) 5243 .addReg(FF1->getOperand(0).getReg()); 5244 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) 5245 .addReg(Accumulator->getOperand(0).getReg()) 5246 .addReg(LaneValue->getOperand(0).getReg()); 5247 5248 // Manipulate the iterator to get the next active lane 5249 unsigned BITSETOpc = 5250 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; 5251 auto NewActiveBits = 5252 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) 5253 .addReg(FF1->getOperand(0).getReg()) 5254 .addReg(ActiveBits->getOperand(0).getReg()); 5255 5256 // Add phi nodes 5257 Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) 5258 .addMBB(ComputeLoop); 5259 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) 5260 .addMBB(ComputeLoop); 5261 5262 // Creating branching 5263 unsigned CMPOpc = IsWave32 ? 
AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; 5264 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) 5265 .addReg(NewActiveBits->getOperand(0).getReg()) 5266 .addImm(0); 5267 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 5268 .addMBB(ComputeLoop); 5269 5270 RetBB = ComputeEnd; 5271 } 5272 MI.eraseFromParent(); 5273 return RetBB; 5274 } 5275 5276 MachineBasicBlock * 5277 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 5278 MachineBasicBlock *BB) const { 5279 5280 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5281 MachineFunction *MF = BB->getParent(); 5282 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 5283 5284 switch (MI.getOpcode()) { 5285 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: 5286 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); 5287 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32: 5288 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); 5289 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: 5290 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); 5291 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32: 5292 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); 5293 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32: 5294 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); 5295 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: 5296 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); 5297 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: 5298 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); 5299 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: 5300 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); 5301 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32: 5302 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32); 5303 case AMDGPU::S_UADDO_PSEUDO: 5304 case AMDGPU::S_USUBO_PSEUDO: { 5305 const DebugLoc &DL = MI.getDebugLoc(); 5306 MachineOperand &Dest0 = MI.getOperand(0); 5307 MachineOperand &Dest1 = MI.getOperand(1); 5308 MachineOperand &Src0 = MI.getOperand(2); 5309 MachineOperand &Src1 = MI.getOperand(3); 5310 5311 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 5312 ? AMDGPU::S_ADD_I32 5313 : AMDGPU::S_SUB_I32; 5314 // clang-format off 5315 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()) 5316 .add(Src0) 5317 .add(Src1); 5318 // clang-format on 5319 5320 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) 5321 .addImm(1) 5322 .addImm(0); 5323 5324 MI.eraseFromParent(); 5325 return BB; 5326 } 5327 case AMDGPU::S_ADD_U64_PSEUDO: 5328 case AMDGPU::S_SUB_U64_PSEUDO: { 5329 // For targets older than GFX12, we emit a sequence of 32-bit operations. 5330 // For GFX12, we emit s_add_u64 and s_sub_u64. 5331 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5332 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5333 const DebugLoc &DL = MI.getDebugLoc(); 5334 MachineOperand &Dest = MI.getOperand(0); 5335 MachineOperand &Src0 = MI.getOperand(1); 5336 MachineOperand &Src1 = MI.getOperand(2); 5337 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 5338 if (Subtarget->hasScalarAddSub64()) { 5339 unsigned Opc = IsAdd ? 
AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; 5340 // clang-format off 5341 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) 5342 .add(Src0) 5343 .add(Src1); 5344 // clang-format on 5345 } else { 5346 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5347 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); 5348 5349 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5350 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5351 5352 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( 5353 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); 5354 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( 5355 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); 5356 5357 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( 5358 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); 5359 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( 5360 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); 5361 5362 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 5363 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 5364 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) 5365 .add(Src0Sub0) 5366 .add(Src1Sub0); 5367 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) 5368 .add(Src0Sub1) 5369 .add(Src1Sub1); 5370 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) 5371 .addReg(DestSub0) 5372 .addImm(AMDGPU::sub0) 5373 .addReg(DestSub1) 5374 .addImm(AMDGPU::sub1); 5375 } 5376 MI.eraseFromParent(); 5377 return BB; 5378 } 5379 case AMDGPU::V_ADD_U64_PSEUDO: 5380 case AMDGPU::V_SUB_U64_PSEUDO: { 5381 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5382 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5383 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5384 const DebugLoc &DL = MI.getDebugLoc(); 5385 5386 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); 5387 5388 MachineOperand &Dest = MI.getOperand(0); 5389 MachineOperand &Src0 = MI.getOperand(1); 5390 MachineOperand &Src1 = MI.getOperand(2); 5391 5392 if (IsAdd && ST.hasLshlAddU64Inst()) { 5393 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), 5394 Dest.getReg()) 5395 .add(Src0) 5396 .addImm(0) 5397 .add(Src1); 5398 TII->legalizeOperands(*Add); 5399 MI.eraseFromParent(); 5400 return BB; 5401 } 5402 5403 const auto *CarryRC = TRI->getWaveMaskRegClass(); 5404 5405 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5406 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5407 5408 Register CarryReg = MRI.createVirtualRegister(CarryRC); 5409 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); 5410 5411 const TargetRegisterClass *Src0RC = Src0.isReg() 5412 ? MRI.getRegClass(Src0.getReg()) 5413 : &AMDGPU::VReg_64RegClass; 5414 const TargetRegisterClass *Src1RC = Src1.isReg() 5415 ? 
MRI.getRegClass(Src1.getReg()) 5416 : &AMDGPU::VReg_64RegClass; 5417 5418 const TargetRegisterClass *Src0SubRC = 5419 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); 5420 const TargetRegisterClass *Src1SubRC = 5421 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); 5422 5423 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( 5424 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); 5425 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( 5426 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); 5427 5428 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( 5429 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); 5430 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( 5431 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); 5432 5433 unsigned LoOpc = 5434 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; 5435 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) 5436 .addReg(CarryReg, RegState::Define) 5437 .add(SrcReg0Sub0) 5438 .add(SrcReg1Sub0) 5439 .addImm(0); // clamp bit 5440 5441 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 5442 MachineInstr *HiHalf = 5443 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) 5444 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 5445 .add(SrcReg0Sub1) 5446 .add(SrcReg1Sub1) 5447 .addReg(CarryReg, RegState::Kill) 5448 .addImm(0); // clamp bit 5449 5450 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) 5451 .addReg(DestSub0) 5452 .addImm(AMDGPU::sub0) 5453 .addReg(DestSub1) 5454 .addImm(AMDGPU::sub1); 5455 TII->legalizeOperands(*LoHalf); 5456 TII->legalizeOperands(*HiHalf); 5457 MI.eraseFromParent(); 5458 return BB; 5459 } 5460 case AMDGPU::S_ADD_CO_PSEUDO: 5461 case AMDGPU::S_SUB_CO_PSEUDO: { 5462 // This pseudo has a chance to be selected 5463 // only from uniform add/subcarry node. All the VGPR operands 5464 // therefore assumed to be splat vectors. 5465 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5466 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5467 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5468 MachineBasicBlock::iterator MII = MI; 5469 const DebugLoc &DL = MI.getDebugLoc(); 5470 MachineOperand &Dest = MI.getOperand(0); 5471 MachineOperand &CarryDest = MI.getOperand(1); 5472 MachineOperand &Src0 = MI.getOperand(2); 5473 MachineOperand &Src1 = MI.getOperand(3); 5474 MachineOperand &Src2 = MI.getOperand(4); 5475 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 5476 ? 
AMDGPU::S_ADDC_U32 5477 : AMDGPU::S_SUBB_U32; 5478 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { 5479 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5480 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) 5481 .addReg(Src0.getReg()); 5482 Src0.setReg(RegOp0); 5483 } 5484 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) { 5485 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5486 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) 5487 .addReg(Src1.getReg()); 5488 Src1.setReg(RegOp1); 5489 } 5490 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5491 if (TRI->isVectorRegister(MRI, Src2.getReg())) { 5492 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) 5493 .addReg(Src2.getReg()); 5494 Src2.setReg(RegOp2); 5495 } 5496 5497 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); 5498 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC); 5499 assert(WaveSize == 64 || WaveSize == 32); 5500 5501 if (WaveSize == 64) { 5502 if (ST.hasScalarCompareEq64()) { 5503 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) 5504 .addReg(Src2.getReg()) 5505 .addImm(0); 5506 } else { 5507 const TargetRegisterClass *SubRC = 5508 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0); 5509 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( 5510 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC); 5511 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( 5512 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC); 5513 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5514 5515 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32) 5516 .add(Src2Sub0) 5517 .add(Src2Sub1); 5518 5519 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 5520 .addReg(Src2_32, RegState::Kill) 5521 .addImm(0); 5522 } 5523 } else { 5524 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 5525 .addReg(Src2.getReg()) 5526 .addImm(0); 5527 } 5528 5529 // clang-format off 5530 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()) 5531 .add(Src0) 5532 .add(Src1); 5533 // clang-format on 5534 5535 unsigned SelOpc = 5536 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; 5537 5538 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg()) 5539 .addImm(-1) 5540 .addImm(0); 5541 5542 MI.eraseFromParent(); 5543 return BB; 5544 } 5545 case AMDGPU::SI_INIT_M0: { 5546 MachineOperand &M0Init = MI.getOperand(0); 5547 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), 5548 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32), 5549 AMDGPU::M0) 5550 .add(M0Init); 5551 MI.eraseFromParent(); 5552 return BB; 5553 } 5554 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: { 5555 // Set SCC to true, in case the barrier instruction gets converted to a NOP. 
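    // Comparing 0 with 0 always succeeds, so this sets SCC to 1 without
    // needing any scratch registers.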
5556 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), 5557 TII->get(AMDGPU::S_CMP_EQ_U32)) 5558 .addImm(0) 5559 .addImm(0); 5560 return BB; 5561 } 5562 case AMDGPU::GET_GROUPSTATICSIZE: { 5563 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || 5564 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); 5565 DebugLoc DL = MI.getDebugLoc(); 5566 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) 5567 .add(MI.getOperand(0)) 5568 .addImm(MFI->getLDSSize()); 5569 MI.eraseFromParent(); 5570 return BB; 5571 } 5572 case AMDGPU::GET_SHADERCYCLESHILO: { 5573 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); 5574 MachineRegisterInfo &MRI = MF->getRegInfo(); 5575 const DebugLoc &DL = MI.getDebugLoc(); 5576 // The algorithm is: 5577 // 5578 // hi1 = getreg(SHADER_CYCLES_HI) 5579 // lo1 = getreg(SHADER_CYCLES_LO) 5580 // hi2 = getreg(SHADER_CYCLES_HI) 5581 // 5582 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1. 5583 // Otherwise there was overflow and the result is hi2:0. In both cases the 5584 // result should represent the actual time at some point during the sequence 5585 // of three getregs. 5586 using namespace AMDGPU::Hwreg; 5587 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5588 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) 5589 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); 5590 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5591 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) 5592 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32)); 5593 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5594 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) 5595 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); 5596 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) 5597 .addReg(RegHi1) 5598 .addReg(RegHi2); 5599 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5600 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo) 5601 .addReg(RegLo1) 5602 .addImm(0); 5603 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE)) 5604 .add(MI.getOperand(0)) 5605 .addReg(RegLo) 5606 .addImm(AMDGPU::sub0) 5607 .addReg(RegHi2) 5608 .addImm(AMDGPU::sub1); 5609 MI.eraseFromParent(); 5610 return BB; 5611 } 5612 case AMDGPU::SI_INDIRECT_SRC_V1: 5613 case AMDGPU::SI_INDIRECT_SRC_V2: 5614 case AMDGPU::SI_INDIRECT_SRC_V4: 5615 case AMDGPU::SI_INDIRECT_SRC_V8: 5616 case AMDGPU::SI_INDIRECT_SRC_V9: 5617 case AMDGPU::SI_INDIRECT_SRC_V10: 5618 case AMDGPU::SI_INDIRECT_SRC_V11: 5619 case AMDGPU::SI_INDIRECT_SRC_V12: 5620 case AMDGPU::SI_INDIRECT_SRC_V16: 5621 case AMDGPU::SI_INDIRECT_SRC_V32: 5622 return emitIndirectSrc(MI, *BB, *getSubtarget()); 5623 case AMDGPU::SI_INDIRECT_DST_V1: 5624 case AMDGPU::SI_INDIRECT_DST_V2: 5625 case AMDGPU::SI_INDIRECT_DST_V4: 5626 case AMDGPU::SI_INDIRECT_DST_V8: 5627 case AMDGPU::SI_INDIRECT_DST_V9: 5628 case AMDGPU::SI_INDIRECT_DST_V10: 5629 case AMDGPU::SI_INDIRECT_DST_V11: 5630 case AMDGPU::SI_INDIRECT_DST_V12: 5631 case AMDGPU::SI_INDIRECT_DST_V16: 5632 case AMDGPU::SI_INDIRECT_DST_V32: 5633 return emitIndirectDst(MI, *BB, *getSubtarget()); 5634 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 5635 case AMDGPU::SI_KILL_I1_PSEUDO: 5636 return splitKillBlock(MI, BB); 5637 case AMDGPU::V_CNDMASK_B64_PSEUDO: { 5638 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5639 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5640 const SIRegisterInfo *TRI = 
ST.getRegisterInfo(); 5641 5642 Register Dst = MI.getOperand(0).getReg(); 5643 const MachineOperand &Src0 = MI.getOperand(1); 5644 const MachineOperand &Src1 = MI.getOperand(2); 5645 const DebugLoc &DL = MI.getDebugLoc(); 5646 Register SrcCond = MI.getOperand(3).getReg(); 5647 5648 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5649 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5650 const auto *CondRC = TRI->getWaveMaskRegClass(); 5651 Register SrcCondCopy = MRI.createVirtualRegister(CondRC); 5652 5653 const TargetRegisterClass *Src0RC = Src0.isReg() 5654 ? MRI.getRegClass(Src0.getReg()) 5655 : &AMDGPU::VReg_64RegClass; 5656 const TargetRegisterClass *Src1RC = Src1.isReg() 5657 ? MRI.getRegClass(Src1.getReg()) 5658 : &AMDGPU::VReg_64RegClass; 5659 5660 const TargetRegisterClass *Src0SubRC = 5661 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); 5662 const TargetRegisterClass *Src1SubRC = 5663 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); 5664 5665 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( 5666 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); 5667 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( 5668 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); 5669 5670 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( 5671 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); 5672 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( 5673 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); 5674 5675 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond); 5676 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) 5677 .addImm(0) 5678 .add(Src0Sub0) 5679 .addImm(0) 5680 .add(Src1Sub0) 5681 .addReg(SrcCondCopy); 5682 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) 5683 .addImm(0) 5684 .add(Src0Sub1) 5685 .addImm(0) 5686 .add(Src1Sub1) 5687 .addReg(SrcCondCopy); 5688 5689 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) 5690 .addReg(DstLo) 5691 .addImm(AMDGPU::sub0) 5692 .addReg(DstHi) 5693 .addImm(AMDGPU::sub1); 5694 MI.eraseFromParent(); 5695 return BB; 5696 } 5697 case AMDGPU::SI_BR_UNDEF: { 5698 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5699 const DebugLoc &DL = MI.getDebugLoc(); 5700 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 5701 .add(MI.getOperand(0)); 5702 Br->getOperand(1).setIsUndef(); // read undef SCC 5703 MI.eraseFromParent(); 5704 return BB; 5705 } 5706 case AMDGPU::ADJCALLSTACKUP: 5707 case AMDGPU::ADJCALLSTACKDOWN: { 5708 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 5709 MachineInstrBuilder MIB(*MF, &MI); 5710 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) 5711 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); 5712 return BB; 5713 } 5714 case AMDGPU::SI_CALL_ISEL: { 5715 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5716 const DebugLoc &DL = MI.getDebugLoc(); 5717 5718 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); 5719 5720 MachineInstrBuilder MIB; 5721 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); 5722 5723 for (const MachineOperand &MO : MI.operands()) 5724 MIB.add(MO); 5725 5726 MIB.cloneMemRefs(MI); 5727 MI.eraseFromParent(); 5728 return BB; 5729 } 5730 case AMDGPU::V_ADD_CO_U32_e32: 5731 case AMDGPU::V_SUB_CO_U32_e32: 5732 case AMDGPU::V_SUBREV_CO_U32_e32: { 5733 // TODO: Define distinct V_*_I32_Pseudo instructions instead. 
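    // If the e32 encoding is not available on this subtarget, fall back to the
    // VOP3 (e64) form below, which needs an explicit VCC carry-out operand and
    // a trailing clamp bit.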
5734 const DebugLoc &DL = MI.getDebugLoc(); 5735 unsigned Opc = MI.getOpcode(); 5736 5737 bool NeedClampOperand = false; 5738 if (TII->pseudoToMCOpcode(Opc) == -1) { 5739 Opc = AMDGPU::getVOPe64(Opc); 5740 NeedClampOperand = true; 5741 } 5742 5743 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); 5744 if (TII->isVOP3(*I)) { 5745 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5746 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5747 I.addReg(TRI->getVCC(), RegState::Define); 5748 } 5749 I.add(MI.getOperand(1)).add(MI.getOperand(2)); 5750 if (NeedClampOperand) 5751 I.addImm(0); // clamp bit for e64 encoding 5752 5753 TII->legalizeOperands(*I); 5754 5755 MI.eraseFromParent(); 5756 return BB; 5757 } 5758 case AMDGPU::V_ADDC_U32_e32: 5759 case AMDGPU::V_SUBB_U32_e32: 5760 case AMDGPU::V_SUBBREV_U32_e32: 5761 // These instructions have an implicit use of vcc which counts towards the 5762 // constant bus limit. 5763 TII->legalizeOperands(MI); 5764 return BB; 5765 case AMDGPU::DS_GWS_INIT: 5766 case AMDGPU::DS_GWS_SEMA_BR: 5767 case AMDGPU::DS_GWS_BARRIER: 5768 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); 5769 [[fallthrough]]; 5770 case AMDGPU::DS_GWS_SEMA_V: 5771 case AMDGPU::DS_GWS_SEMA_P: 5772 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: 5773 // A s_waitcnt 0 is required to be the instruction immediately following. 5774 if (getSubtarget()->hasGWSAutoReplay()) { 5775 bundleInstWithWaitcnt(MI); 5776 return BB; 5777 } 5778 5779 return emitGWSMemViolTestLoop(MI, BB); 5780 case AMDGPU::S_SETREG_B32: { 5781 // Try to optimize cases that only set the denormal mode or rounding mode. 5782 // 5783 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or 5784 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode 5785 // instead. 5786 // 5787 // FIXME: This could be predicates on the immediate, but tablegen doesn't 5788 // allow you to have a no side effect instruction in the output of a 5789 // sideeffecting pattern. 5790 auto [ID, Offset, Width] = 5791 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm()); 5792 if (ID != AMDGPU::Hwreg::ID_MODE) 5793 return BB; 5794 5795 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width); 5796 const unsigned SetMask = WidthMask << Offset; 5797 5798 if (getSubtarget()->hasDenormModeInst()) { 5799 unsigned SetDenormOp = 0; 5800 unsigned SetRoundOp = 0; 5801 5802 // The dedicated instructions can only set the whole denorm or round mode 5803 // at once, not a subset of bits in either. 5804 if (SetMask == 5805 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) { 5806 // If this fully sets both the round and denorm mode, emit the two 5807 // dedicated instructions for these. 5808 SetRoundOp = AMDGPU::S_ROUND_MODE; 5809 SetDenormOp = AMDGPU::S_DENORM_MODE; 5810 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) { 5811 SetRoundOp = AMDGPU::S_ROUND_MODE; 5812 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) { 5813 SetDenormOp = AMDGPU::S_DENORM_MODE; 5814 } 5815 5816 if (SetRoundOp || SetDenormOp) { 5817 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5818 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); 5819 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { 5820 unsigned ImmVal = Def->getOperand(1).getImm(); 5821 if (SetRoundOp) { 5822 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) 5823 .addImm(ImmVal & 0xf); 5824 5825 // If we also have the denorm mode, get just the denorm mode bits. 
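            // When both modes are being set, the hwreg window is offset 0 and
            // width 8, so bits [3:0] of the immediate hold fp_round and bits
            // [7:4] hold fp_denorm; shifting right by 4 exposes the latter.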
5826 ImmVal >>= 4; 5827 } 5828 5829 if (SetDenormOp) { 5830 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) 5831 .addImm(ImmVal & 0xf); 5832 } 5833 5834 MI.eraseFromParent(); 5835 return BB; 5836 } 5837 } 5838 } 5839 5840 // If only FP bits are touched, used the no side effects pseudo. 5841 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | 5842 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) 5843 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode)); 5844 5845 return BB; 5846 } 5847 case AMDGPU::S_INVERSE_BALLOT_U32: 5848 case AMDGPU::S_INVERSE_BALLOT_U64: 5849 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if 5850 // necessary. After that they are equivalent to a COPY. 5851 MI.setDesc(TII->get(AMDGPU::COPY)); 5852 return BB; 5853 case AMDGPU::ENDPGM_TRAP: { 5854 const DebugLoc &DL = MI.getDebugLoc(); 5855 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) { 5856 MI.setDesc(TII->get(AMDGPU::S_ENDPGM)); 5857 MI.addOperand(MachineOperand::CreateImm(0)); 5858 return BB; 5859 } 5860 5861 // We need a block split to make the real endpgm a terminator. We also don't 5862 // want to break phis in successor blocks, so we can't just delete to the 5863 // end of the block. 5864 5865 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); 5866 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 5867 MF->push_back(TrapBB); 5868 // clang-format off 5869 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM)) 5870 .addImm(0); 5871 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 5872 .addMBB(TrapBB); 5873 // clang-format on 5874 5875 BB->addSuccessor(TrapBB); 5876 MI.eraseFromParent(); 5877 return SplitBB; 5878 } 5879 case AMDGPU::SIMULATED_TRAP: { 5880 assert(Subtarget->hasPrivEnabledTrap2NopBug()); 5881 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5882 MachineBasicBlock *SplitBB = 5883 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); 5884 MI.eraseFromParent(); 5885 return SplitBB; 5886 } 5887 default: 5888 if (TII->isImage(MI) || TII->isMUBUF(MI)) { 5889 if (!MI.mayStore()) 5890 AddMemOpInit(MI); 5891 return BB; 5892 } 5893 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 5894 } 5895 } 5896 5897 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 5898 // This currently forces unfolding various combinations of fsub into fma with 5899 // free fneg'd operands. As long as we have fast FMA (controlled by 5900 // isFMAFasterThanFMulAndFAdd), we should perform these. 5901 5902 // When fma is quarter rate, for f64 where add / sub are at best half rate, 5903 // most of these combines appear to be cycle neutral but save on instruction 5904 // count / code size. 5905 return true; 5906 } 5907 5908 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } 5909 5910 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 5911 EVT VT) const { 5912 if (!VT.isVector()) { 5913 return MVT::i1; 5914 } 5915 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 5916 } 5917 5918 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { 5919 // TODO: Should i16 be used always if legal? For now it would force VALU 5920 // shifts. 5921 return (VT == MVT::i16) ? MVT::i16 : MVT::i32; 5922 } 5923 5924 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { 5925 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) 5926 ? 
Ty.changeElementSize(16) 5927 : Ty.changeElementSize(32); 5928 } 5929 5930 // Answering this is somewhat tricky and depends on the specific device which 5931 // have different rates for fma or all f64 operations. 5932 // 5933 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 5934 // regardless of which device (although the number of cycles differs between 5935 // devices), so it is always profitable for f64. 5936 // 5937 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 5938 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 5939 // which we can always do even without fused FP ops since it returns the same 5940 // result as the separate operations and since it is always full 5941 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 5942 // however does not support denormals, so we do report fma as faster if we have 5943 // a fast fma device and require denormals. 5944 // 5945 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 5946 EVT VT) const { 5947 VT = VT.getScalarType(); 5948 5949 switch (VT.getSimpleVT().SimpleTy) { 5950 case MVT::f32: { 5951 // If mad is not available this depends only on if f32 fma is full rate. 5952 if (!Subtarget->hasMadMacF32Insts()) 5953 return Subtarget->hasFastFMAF32(); 5954 5955 // Otherwise f32 mad is always full rate and returns the same result as 5956 // the separate operations so should be preferred over fma. 5957 // However does not support denormals. 5958 if (!denormalModeIsFlushAllF32(MF)) 5959 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); 5960 5961 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. 5962 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); 5963 } 5964 case MVT::f64: 5965 return true; 5966 case MVT::f16: 5967 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); 5968 default: 5969 break; 5970 } 5971 5972 return false; 5973 } 5974 5975 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 5976 LLT Ty) const { 5977 switch (Ty.getScalarSizeInBits()) { 5978 case 16: 5979 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16); 5980 case 32: 5981 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32); 5982 case 64: 5983 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64); 5984 default: 5985 break; 5986 } 5987 5988 return false; 5989 } 5990 5991 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { 5992 if (!Ty.isScalar()) 5993 return false; 5994 5995 if (Ty.getScalarSizeInBits() == 16) 5996 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF()); 5997 if (Ty.getScalarSizeInBits() == 32) 5998 return Subtarget->hasMadMacF32Insts() && 5999 denormalModeIsFlushAllF32(*MI.getMF()); 6000 6001 return false; 6002 } 6003 6004 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, 6005 const SDNode *N) const { 6006 // TODO: Check future ftz flag 6007 // v_mad_f32/v_mac_f32 do not support denormals. 
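  // In other words, FMAD is only reported legal when the corresponding
  // denormal mode flushes (preserve-sign), so folding a separate mul + add
  // into v_mad/v_mac cannot change results for denormal inputs.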
6008 EVT VT = N->getValueType(0); 6009 if (VT == MVT::f32) 6010 return Subtarget->hasMadMacF32Insts() && 6011 denormalModeIsFlushAllF32(DAG.getMachineFunction()); 6012 if (VT == MVT::f16) { 6013 return Subtarget->hasMadF16() && 6014 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); 6015 } 6016 6017 return false; 6018 } 6019 6020 //===----------------------------------------------------------------------===// 6021 // Custom DAG Lowering Operations 6022 //===----------------------------------------------------------------------===// 6023 6024 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the 6025 // wider vector type is legal. 6026 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, 6027 SelectionDAG &DAG) const { 6028 unsigned Opc = Op.getOpcode(); 6029 EVT VT = Op.getValueType(); 6030 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || 6031 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || 6032 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 6033 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); 6034 6035 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); 6036 6037 SDLoc SL(Op); 6038 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags()); 6039 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags()); 6040 6041 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 6042 } 6043 6044 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the 6045 // wider vector type is legal. 6046 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, 6047 SelectionDAG &DAG) const { 6048 unsigned Opc = Op.getOpcode(); 6049 EVT VT = Op.getValueType(); 6050 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || 6051 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || 6052 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || 6053 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 6054 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || 6055 VT == MVT::v32bf16); 6056 6057 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); 6058 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); 6059 6060 SDLoc SL(Op); 6061 6062 SDValue OpLo = 6063 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags()); 6064 SDValue OpHi = 6065 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags()); 6066 6067 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 6068 } 6069 6070 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, 6071 SelectionDAG &DAG) const { 6072 unsigned Opc = Op.getOpcode(); 6073 EVT VT = Op.getValueType(); 6074 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || 6075 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || 6076 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 6077 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 || 6078 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 || 6079 VT == MVT::v32bf16); 6080 6081 SDValue Op0 = Op.getOperand(0); 6082 auto [Lo0, Hi0] = Op0.getValueType().isVector() 6083 ? 
DAG.SplitVectorOperand(Op.getNode(), 0) 6084 : std::pair(Op0, Op0); 6085 6086 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); 6087 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2); 6088 6089 SDLoc SL(Op); 6090 auto ResVT = DAG.GetSplitDestVTs(VT); 6091 6092 SDValue OpLo = 6093 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags()); 6094 SDValue OpHi = 6095 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags()); 6096 6097 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 6098 } 6099 6100 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 6101 switch (Op.getOpcode()) { 6102 default: 6103 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 6104 case ISD::BRCOND: 6105 return LowerBRCOND(Op, DAG); 6106 case ISD::RETURNADDR: 6107 return LowerRETURNADDR(Op, DAG); 6108 case ISD::LOAD: { 6109 SDValue Result = LowerLOAD(Op, DAG); 6110 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) && 6111 "Load should return a value and a chain"); 6112 return Result; 6113 } 6114 case ISD::FSQRT: { 6115 EVT VT = Op.getValueType(); 6116 if (VT == MVT::f32) 6117 return lowerFSQRTF32(Op, DAG); 6118 if (VT == MVT::f64) 6119 return lowerFSQRTF64(Op, DAG); 6120 return SDValue(); 6121 } 6122 case ISD::FSIN: 6123 case ISD::FCOS: 6124 return LowerTrig(Op, DAG); 6125 case ISD::SELECT: 6126 return LowerSELECT(Op, DAG); 6127 case ISD::FDIV: 6128 return LowerFDIV(Op, DAG); 6129 case ISD::FFREXP: 6130 return LowerFFREXP(Op, DAG); 6131 case ISD::ATOMIC_CMP_SWAP: 6132 return LowerATOMIC_CMP_SWAP(Op, DAG); 6133 case ISD::STORE: 6134 return LowerSTORE(Op, DAG); 6135 case ISD::GlobalAddress: { 6136 MachineFunction &MF = DAG.getMachineFunction(); 6137 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 6138 return LowerGlobalAddress(MFI, Op, DAG); 6139 } 6140 case ISD::INTRINSIC_WO_CHAIN: 6141 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 6142 case ISD::INTRINSIC_W_CHAIN: 6143 return LowerINTRINSIC_W_CHAIN(Op, DAG); 6144 case ISD::INTRINSIC_VOID: 6145 return LowerINTRINSIC_VOID(Op, DAG); 6146 case ISD::ADDRSPACECAST: 6147 return lowerADDRSPACECAST(Op, DAG); 6148 case ISD::INSERT_SUBVECTOR: 6149 return lowerINSERT_SUBVECTOR(Op, DAG); 6150 case ISD::INSERT_VECTOR_ELT: 6151 return lowerINSERT_VECTOR_ELT(Op, DAG); 6152 case ISD::EXTRACT_VECTOR_ELT: 6153 return lowerEXTRACT_VECTOR_ELT(Op, DAG); 6154 case ISD::VECTOR_SHUFFLE: 6155 return lowerVECTOR_SHUFFLE(Op, DAG); 6156 case ISD::SCALAR_TO_VECTOR: 6157 return lowerSCALAR_TO_VECTOR(Op, DAG); 6158 case ISD::BUILD_VECTOR: 6159 return lowerBUILD_VECTOR(Op, DAG); 6160 case ISD::FP_ROUND: 6161 case ISD::STRICT_FP_ROUND: 6162 return lowerFP_ROUND(Op, DAG); 6163 case ISD::TRAP: 6164 return lowerTRAP(Op, DAG); 6165 case ISD::DEBUGTRAP: 6166 return lowerDEBUGTRAP(Op, DAG); 6167 case ISD::ABS: 6168 case ISD::FABS: 6169 case ISD::FNEG: 6170 case ISD::FCANONICALIZE: 6171 case ISD::BSWAP: 6172 return splitUnaryVectorOp(Op, DAG); 6173 case ISD::FMINNUM: 6174 case ISD::FMAXNUM: 6175 return lowerFMINNUM_FMAXNUM(Op, DAG); 6176 case ISD::FMINIMUMNUM: 6177 case ISD::FMAXIMUMNUM: 6178 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG); 6179 case ISD::FMINIMUM: 6180 case ISD::FMAXIMUM: 6181 return lowerFMINIMUM_FMAXIMUM(Op, DAG); 6182 case ISD::FLDEXP: 6183 case ISD::STRICT_FLDEXP: 6184 return lowerFLDEXP(Op, DAG); 6185 case ISD::FMA: 6186 return splitTernaryVectorOp(Op, DAG); 6187 case ISD::FP_TO_SINT: 6188 case ISD::FP_TO_UINT: 6189 return LowerFP_TO_INT(Op, DAG); 6190 case ISD::SHL: 6191 case 
ISD::SRA:
6192 case ISD::SRL:
6193 case ISD::ADD:
6194 case ISD::SUB:
6195 case ISD::SMIN:
6196 case ISD::SMAX:
6197 case ISD::UMIN:
6198 case ISD::UMAX:
6199 case ISD::FADD:
6200 case ISD::FMUL:
6201 case ISD::FMINNUM_IEEE:
6202 case ISD::FMAXNUM_IEEE:
6203 case ISD::UADDSAT:
6204 case ISD::USUBSAT:
6205 case ISD::SADDSAT:
6206 case ISD::SSUBSAT:
6207 return splitBinaryVectorOp(Op, DAG);
6208 case ISD::FCOPYSIGN:
6209 return lowerFCOPYSIGN(Op, DAG);
6210 case ISD::MUL:
6211 return lowerMUL(Op, DAG);
6212 case ISD::SMULO:
6213 case ISD::UMULO:
6214 return lowerXMULO(Op, DAG);
6215 case ISD::SMUL_LOHI:
6216 case ISD::UMUL_LOHI:
6217 return lowerXMUL_LOHI(Op, DAG);
6218 case ISD::DYNAMIC_STACKALLOC:
6219 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6220 case ISD::STACKSAVE:
6221 return LowerSTACKSAVE(Op, DAG);
6222 case ISD::GET_ROUNDING:
6223 return lowerGET_ROUNDING(Op, DAG);
6224 case ISD::SET_ROUNDING:
6225 return lowerSET_ROUNDING(Op, DAG);
6226 case ISD::PREFETCH:
6227 return lowerPREFETCH(Op, DAG);
6228 case ISD::FP_EXTEND:
6229 case ISD::STRICT_FP_EXTEND:
6230 return lowerFP_EXTEND(Op, DAG);
6231 case ISD::GET_FPENV:
6232 return lowerGET_FPENV(Op, DAG);
6233 case ISD::SET_FPENV:
6234 return lowerSET_FPENV(Op, DAG);
6235 }
6236 return SDValue();
6237 }
6238
6239 // Used for D16: Casts the result of an instruction into the right vector,
6240 // packs values if loads return unpacked values.
6241 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6242 const SDLoc &DL, SelectionDAG &DAG,
6243 bool Unpacked) {
6244 if (!LoadVT.isVector())
6245 return Result;
6246
6247 // Cast back to the original packed type or to a larger type that is a
6248 // multiple of 32 bits for D16. Widening the return type is required for
6249 // legalization.
6250 EVT FittingLoadVT = LoadVT;
6251 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6252 FittingLoadVT =
6253 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6254 LoadVT.getVectorNumElements() + 1);
6255 }
6256
6257 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6258 // Truncate to v2i16/v4i16.
6259 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6260
6261 // Workaround legalizer not scalarizing truncate after vector op
6262 // legalization but not creating intermediate vector trunc.
6263 SmallVector<SDValue, 4> Elts;
6264 DAG.ExtractVectorElements(Result, Elts);
6265 for (SDValue &Elt : Elts)
6266 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6267
6268 // Pad illegal v1i16/v3f16 to v4i16
6269 if ((LoadVT.getVectorNumElements() % 2) == 1)
6270 Elts.push_back(DAG.getPOISON(MVT::i16));
6271
6272 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6273
6274 // Bitcast to original type (v2f16/v4f16).
6275 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6276 }
6277
6278 // Cast back to the original packed type.
6279 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); 6280 } 6281 6282 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M, 6283 SelectionDAG &DAG, 6284 ArrayRef<SDValue> Ops, 6285 bool IsIntrinsic) const { 6286 SDLoc DL(M); 6287 6288 bool Unpacked = Subtarget->hasUnpackedD16VMem(); 6289 EVT LoadVT = M->getValueType(0); 6290 6291 EVT EquivLoadVT = LoadVT; 6292 if (LoadVT.isVector()) { 6293 if (Unpacked) { 6294 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 6295 LoadVT.getVectorNumElements()); 6296 } else if ((LoadVT.getVectorNumElements() % 2) == 1) { 6297 // Widen v3f16 to legal type 6298 EquivLoadVT = 6299 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), 6300 LoadVT.getVectorNumElements() + 1); 6301 } 6302 } 6303 6304 // Change from v4f16/v2f16 to EquivLoadVT. 6305 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); 6306 6307 SDValue Load = DAG.getMemIntrinsicNode( 6308 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops, 6309 M->getMemoryVT(), M->getMemOperand()); 6310 6311 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); 6312 6313 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL); 6314 } 6315 6316 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, 6317 SelectionDAG &DAG, 6318 ArrayRef<SDValue> Ops) const { 6319 SDLoc DL(M); 6320 EVT LoadVT = M->getValueType(0); 6321 EVT EltType = LoadVT.getScalarType(); 6322 EVT IntVT = LoadVT.changeTypeToInteger(); 6323 6324 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 6325 6326 assert(M->getNumValues() == 2 || M->getNumValues() == 3); 6327 bool IsTFE = M->getNumValues() == 3; 6328 6329 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE 6330 : AMDGPUISD::BUFFER_LOAD_FORMAT) 6331 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE 6332 : AMDGPUISD::BUFFER_LOAD; 6333 6334 if (IsD16) { 6335 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); 6336 } 6337 6338 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics 6339 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) 6340 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(), 6341 IsTFE); 6342 6343 if (isTypeLegal(LoadVT)) { 6344 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, 6345 M->getMemOperand(), DAG); 6346 } 6347 6348 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT); 6349 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other); 6350 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT, 6351 M->getMemOperand(), DAG); 6352 return DAG.getMergeValues( 6353 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)}, 6354 DL); 6355 } 6356 6357 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, 6358 SelectionDAG &DAG) { 6359 EVT VT = N->getValueType(0); 6360 unsigned CondCode = N->getConstantOperandVal(3); 6361 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode))) 6362 return DAG.getPOISON(VT); 6363 6364 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); 6365 6366 SDValue LHS = N->getOperand(1); 6367 SDValue RHS = N->getOperand(2); 6368 6369 SDLoc DL(N); 6370 6371 EVT CmpVT = LHS.getValueType(); 6372 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) { 6373 unsigned PromoteOp = 6374 ICmpInst::isSigned(IcInput) ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6375 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS); 6376 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS); 6377 } 6378 6379 ISD::CondCode CCOpcode = getICmpCondCode(IcInput); 6380 6381 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); 6382 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); 6383 6384 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS, 6385 DAG.getCondCode(CCOpcode)); 6386 if (VT.bitsEq(CCVT)) 6387 return SetCC; 6388 return DAG.getZExtOrTrunc(SetCC, DL, VT); 6389 } 6390 6391 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, 6392 SelectionDAG &DAG) { 6393 EVT VT = N->getValueType(0); 6394 6395 unsigned CondCode = N->getConstantOperandVal(3); 6396 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode))) 6397 return DAG.getPOISON(VT); 6398 6399 SDValue Src0 = N->getOperand(1); 6400 SDValue Src1 = N->getOperand(2); 6401 EVT CmpVT = Src0.getValueType(); 6402 SDLoc SL(N); 6403 6404 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) { 6405 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); 6406 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); 6407 } 6408 6409 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); 6410 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); 6411 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); 6412 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); 6413 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1, 6414 DAG.getCondCode(CCOpcode)); 6415 if (VT.bitsEq(CCVT)) 6416 return SetCC; 6417 return DAG.getZExtOrTrunc(SetCC, SL, VT); 6418 } 6419 6420 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, 6421 SelectionDAG &DAG) { 6422 EVT VT = N->getValueType(0); 6423 SDValue Src = N->getOperand(1); 6424 SDLoc SL(N); 6425 6426 if (Src.getOpcode() == ISD::SETCC) { 6427 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
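// Reusing the compare directly avoids materializing an intermediate i1 and
// re-comparing; AMDGPUISD::SETCC already produces the per-lane mask value.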
6428 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), 6429 Src.getOperand(1), Src.getOperand(2)); 6430 } 6431 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { 6432 // (ballot 0) -> 0 6433 if (Arg->isZero()) 6434 return DAG.getConstant(0, SL, VT); 6435 6436 // (ballot 1) -> EXEC/EXEC_LO 6437 if (Arg->isOne()) { 6438 Register Exec; 6439 if (VT.getScalarSizeInBits() == 32) 6440 Exec = AMDGPU::EXEC_LO; 6441 else if (VT.getScalarSizeInBits() == 64) 6442 Exec = AMDGPU::EXEC; 6443 else 6444 return SDValue(); 6445 6446 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT); 6447 } 6448 } 6449 6450 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) 6451 // ISD::SETNE) 6452 return DAG.getNode( 6453 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32), 6454 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); 6455 } 6456 6457 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, 6458 SelectionDAG &DAG) { 6459 EVT VT = N->getValueType(0); 6460 unsigned ValSize = VT.getSizeInBits(); 6461 unsigned IID = N->getConstantOperandVal(0); 6462 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || 6463 IID == Intrinsic::amdgcn_permlanex16; 6464 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || 6465 IID == Intrinsic::amdgcn_set_inactive_chain_arg; 6466 SDLoc SL(N); 6467 MVT IntVT = MVT::getIntegerVT(ValSize); 6468 const GCNSubtarget *ST = TLI.getSubtarget(); 6469 unsigned SplitSize = 32; 6470 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) && 6471 ST->hasDPALU_DPP() && 6472 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3))) 6473 SplitSize = 64; 6474 6475 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, 6476 SDValue Src2, MVT ValT) -> SDValue { 6477 SmallVector<SDValue, 8> Operands; 6478 switch (IID) { 6479 case Intrinsic::amdgcn_permlane16: 6480 case Intrinsic::amdgcn_permlanex16: 6481 case Intrinsic::amdgcn_update_dpp: 6482 Operands.push_back(N->getOperand(6)); 6483 Operands.push_back(N->getOperand(5)); 6484 Operands.push_back(N->getOperand(4)); 6485 [[fallthrough]]; 6486 case Intrinsic::amdgcn_writelane: 6487 Operands.push_back(Src2); 6488 [[fallthrough]]; 6489 case Intrinsic::amdgcn_readlane: 6490 case Intrinsic::amdgcn_set_inactive: 6491 case Intrinsic::amdgcn_set_inactive_chain_arg: 6492 case Intrinsic::amdgcn_mov_dpp8: 6493 Operands.push_back(Src1); 6494 [[fallthrough]]; 6495 case Intrinsic::amdgcn_readfirstlane: 6496 case Intrinsic::amdgcn_permlane64: 6497 Operands.push_back(Src0); 6498 break; 6499 default: 6500 llvm_unreachable("unhandled lane op"); 6501 } 6502 6503 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32)); 6504 std::reverse(Operands.begin(), Operands.end()); 6505 6506 if (SDNode *GL = N->getGluedNode()) { 6507 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); 6508 GL = GL->getOperand(0).getNode(); 6509 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, 6510 SDValue(GL, 0))); 6511 } 6512 6513 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands); 6514 }; 6515 6516 SDValue Src0 = N->getOperand(1); 6517 SDValue Src1, Src2; 6518 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || 6519 IID == Intrinsic::amdgcn_mov_dpp8 || 6520 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { 6521 Src1 = N->getOperand(2); 6522 if (IID == Intrinsic::amdgcn_writelane || 6523 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16) 6524 Src2 = N->getOperand(3); 6525 } 6526 6527 if 
(ValSize == SplitSize) { 6528 // Already legal 6529 return SDValue(); 6530 } 6531 6532 if (ValSize < 32) { 6533 bool IsFloat = VT.isFloatingPoint(); 6534 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, 6535 SL, MVT::i32); 6536 6537 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { 6538 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1, 6539 SL, MVT::i32); 6540 } 6541 6542 if (IID == Intrinsic::amdgcn_writelane) { 6543 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, 6544 SL, MVT::i32); 6545 } 6546 6547 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); 6548 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); 6549 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc; 6550 } 6551 6552 if (ValSize % SplitSize != 0) 6553 return SDValue(); 6554 6555 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { 6556 EVT VT = N->getValueType(0); 6557 unsigned NE = VT.getVectorNumElements(); 6558 EVT EltVT = VT.getVectorElementType(); 6559 SmallVector<SDValue, 8> Scalars; 6560 unsigned NumOperands = N->getNumOperands(); 6561 SmallVector<SDValue, 4> Operands(NumOperands); 6562 SDNode *GL = N->getGluedNode(); 6563 6564 // only handle convergencectrl_glue 6565 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); 6566 6567 for (unsigned i = 0; i != NE; ++i) { 6568 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e; 6569 ++j) { 6570 SDValue Operand = N->getOperand(j); 6571 EVT OperandVT = Operand.getValueType(); 6572 if (OperandVT.isVector()) { 6573 // A vector operand; extract a single element. 6574 EVT OperandEltVT = OperandVT.getVectorElementType(); 6575 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT, 6576 Operand, DAG.getVectorIdxConstant(i, SL)); 6577 } else { 6578 // A scalar operand; just use it as is. 
6579 Operands[j] = Operand; 6580 } 6581 } 6582 6583 if (GL) 6584 Operands[NumOperands - 1] = 6585 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, 6586 SDValue(GL->getOperand(0).getNode(), 0)); 6587 6588 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands)); 6589 } 6590 6591 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE); 6592 return DAG.getBuildVector(VecVT, SL, Scalars); 6593 }; 6594 6595 if (VT.isVector()) { 6596 switch (MVT::SimpleValueType EltTy = 6597 VT.getVectorElementType().getSimpleVT().SimpleTy) { 6598 case MVT::i32: 6599 case MVT::f32: 6600 if (SplitSize == 32) { 6601 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); 6602 return unrollLaneOp(LaneOp.getNode()); 6603 } 6604 [[fallthrough]]; 6605 case MVT::i16: 6606 case MVT::f16: 6607 case MVT::bf16: { 6608 unsigned SubVecNumElt = 6609 SplitSize / VT.getVectorElementType().getSizeInBits(); 6610 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt); 6611 SmallVector<SDValue, 4> Pieces; 6612 SDValue Src0SubVec, Src1SubVec, Src2SubVec; 6613 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) { 6614 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0, 6615 DAG.getConstant(EltIdx, SL, MVT::i32)); 6616 6617 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || 6618 IsPermLane16) 6619 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, 6620 DAG.getConstant(EltIdx, SL, MVT::i32)); 6621 6622 if (IID == Intrinsic::amdgcn_writelane) 6623 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, 6624 DAG.getConstant(EltIdx, SL, MVT::i32)); 6625 6626 Pieces.push_back( 6627 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 6628 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) 6629 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); 6630 EltIdx += SubVecNumElt; 6631 } 6632 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); 6633 } 6634 default: 6635 // Handle all other cases by bitcasting to i32 vectors 6636 break; 6637 } 6638 } 6639 6640 MVT VecVT = 6641 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize); 6642 Src0 = DAG.getBitcast(VecVT, Src0); 6643 6644 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 6645 Src1 = DAG.getBitcast(VecVT, Src1); 6646 6647 if (IID == Intrinsic::amdgcn_writelane) 6648 Src2 = DAG.getBitcast(VecVT, Src2); 6649 6650 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); 6651 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode()); 6652 return DAG.getBitcast(VT, UnrolledLaneOp); 6653 } 6654 6655 void SITargetLowering::ReplaceNodeResults(SDNode *N, 6656 SmallVectorImpl<SDValue> &Results, 6657 SelectionDAG &DAG) const { 6658 switch (N->getOpcode()) { 6659 case ISD::INSERT_VECTOR_ELT: { 6660 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG)) 6661 Results.push_back(Res); 6662 return; 6663 } 6664 case ISD::EXTRACT_VECTOR_ELT: { 6665 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG)) 6666 Results.push_back(Res); 6667 return; 6668 } 6669 case ISD::INTRINSIC_WO_CHAIN: { 6670 unsigned IID = N->getConstantOperandVal(0); 6671 switch (IID) { 6672 case Intrinsic::amdgcn_make_buffer_rsrc: 6673 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG)); 6674 return; 6675 case Intrinsic::amdgcn_cvt_pkrtz: { 6676 SDValue Src0 = N->getOperand(1); 6677 SDValue Src1 = N->getOperand(2); 6678 SDLoc SL(N); 6679 SDValue Cvt = 6680 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1); 6681 
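// CVT_PKRTZ_F16_F32 yields two f16 values packed into an i32; bitcast the
// result back to the intrinsic's v2f16 result type.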
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); 6682 return; 6683 } 6684 case Intrinsic::amdgcn_cvt_pknorm_i16: 6685 case Intrinsic::amdgcn_cvt_pknorm_u16: 6686 case Intrinsic::amdgcn_cvt_pk_i16: 6687 case Intrinsic::amdgcn_cvt_pk_u16: { 6688 SDValue Src0 = N->getOperand(1); 6689 SDValue Src1 = N->getOperand(2); 6690 SDLoc SL(N); 6691 unsigned Opcode; 6692 6693 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) 6694 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 6695 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) 6696 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 6697 else if (IID == Intrinsic::amdgcn_cvt_pk_i16) 6698 Opcode = AMDGPUISD::CVT_PK_I16_I32; 6699 else 6700 Opcode = AMDGPUISD::CVT_PK_U16_U32; 6701 6702 EVT VT = N->getValueType(0); 6703 if (isTypeLegal(VT)) 6704 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1)); 6705 else { 6706 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); 6707 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); 6708 } 6709 return; 6710 } 6711 case Intrinsic::amdgcn_s_buffer_load: { 6712 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate 6713 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG 6714 // combiner tries to merge the s_buffer_load_u8 with a sext instruction 6715 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with 6716 // s_buffer_load_i8. 6717 if (!Subtarget->hasScalarSubwordLoads()) 6718 return; 6719 SDValue Op = SDValue(N, 0); 6720 SDValue Rsrc = Op.getOperand(1); 6721 SDValue Offset = Op.getOperand(2); 6722 SDValue CachePolicy = Op.getOperand(3); 6723 EVT VT = Op.getValueType(); 6724 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n"); 6725 SDLoc DL(Op); 6726 MachineFunction &MF = DAG.getMachineFunction(); 6727 const DataLayout &DataLayout = DAG.getDataLayout(); 6728 Align Alignment = 6729 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); 6730 MachineMemOperand *MMO = MF.getMachineMemOperand( 6731 MachinePointerInfo(), 6732 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6733 MachineMemOperand::MOInvariant, 6734 VT.getStoreSize(), Alignment); 6735 SDValue LoadVal; 6736 if (!Offset->isDivergent()) { 6737 SDValue Ops[] = {Rsrc, // source register 6738 Offset, CachePolicy}; 6739 SDValue BufferLoad = 6740 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL, 6741 DAG.getVTList(MVT::i32), Ops, VT, MMO); 6742 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 6743 } else { 6744 SDValue Ops[] = { 6745 DAG.getEntryNode(), // Chain 6746 Rsrc, // rsrc 6747 DAG.getConstant(0, DL, MVT::i32), // vindex 6748 {}, // voffset 6749 {}, // soffset 6750 {}, // offset 6751 CachePolicy, // cachepolicy 6752 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 6753 }; 6754 setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); 6755 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); 6756 } 6757 Results.push_back(LoadVal); 6758 return; 6759 } 6760 case Intrinsic::amdgcn_dead: { 6761 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I) 6762 Results.push_back(DAG.getPOISON(N->getValueType(I))); 6763 return; 6764 } 6765 } 6766 break; 6767 } 6768 case ISD::INTRINSIC_W_CHAIN: { 6769 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { 6770 if (Res.getOpcode() == ISD::MERGE_VALUES) { 6771 // FIXME: Hacky 6772 for (unsigned I = 0; I < Res.getNumOperands(); I++) { 6773 Results.push_back(Res.getOperand(I)); 6774 } 6775 } else { 6776 Results.push_back(Res); 6777 Results.push_back(Res.getValue(1)); 
6778 } 6779 return; 6780 } 6781 6782 break; 6783 } 6784 case ISD::SELECT: { 6785 SDLoc SL(N); 6786 EVT VT = N->getValueType(0); 6787 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 6788 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1)); 6789 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2)); 6790 6791 EVT SelectVT = NewVT; 6792 if (NewVT.bitsLT(MVT::i32)) { 6793 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS); 6794 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS); 6795 SelectVT = MVT::i32; 6796 } 6797 6798 SDValue NewSelect = 6799 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS); 6800 6801 if (NewVT != SelectVT) 6802 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect); 6803 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); 6804 return; 6805 } 6806 case ISD::FNEG: { 6807 if (N->getValueType(0) != MVT::v2f16) 6808 break; 6809 6810 SDLoc SL(N); 6811 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); 6812 6813 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC, 6814 DAG.getConstant(0x80008000, SL, MVT::i32)); 6815 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); 6816 return; 6817 } 6818 case ISD::FABS: { 6819 if (N->getValueType(0) != MVT::v2f16) 6820 break; 6821 6822 SDLoc SL(N); 6823 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); 6824 6825 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC, 6826 DAG.getConstant(0x7fff7fff, SL, MVT::i32)); 6827 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); 6828 return; 6829 } 6830 case ISD::FSQRT: { 6831 if (N->getValueType(0) != MVT::f16) 6832 break; 6833 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); 6834 break; 6835 } 6836 default: 6837 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 6838 break; 6839 } 6840 } 6841 6842 /// Helper function for LowerBRCOND 6843 static SDNode *findUser(SDValue Value, unsigned Opcode) { 6844 6845 for (SDUse &U : Value->uses()) { 6846 if (U.get() != Value) 6847 continue; 6848 6849 if (U.getUser()->getOpcode() == Opcode) 6850 return U.getUser(); 6851 } 6852 return nullptr; 6853 } 6854 6855 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { 6856 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 6857 switch (Intr->getConstantOperandVal(1)) { 6858 case Intrinsic::amdgcn_if: 6859 return AMDGPUISD::IF; 6860 case Intrinsic::amdgcn_else: 6861 return AMDGPUISD::ELSE; 6862 case Intrinsic::amdgcn_loop: 6863 return AMDGPUISD::LOOP; 6864 case Intrinsic::amdgcn_end_cf: 6865 llvm_unreachable("should not occur"); 6866 default: 6867 return 0; 6868 } 6869 } 6870 6871 // break, if_break, else_break are all only used as inputs to loop, not 6872 // directly as branch conditions. 6873 return 0; 6874 } 6875 6876 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { 6877 const Triple &TT = getTargetMachine().getTargetTriple(); 6878 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 6879 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 6880 AMDGPU::shouldEmitConstantsToTextSection(TT); 6881 } 6882 6883 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { 6884 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) 6885 return false; 6886 6887 // FIXME: Either avoid relying on address space here or change the default 6888 // address space for functions to avoid the explicit check. 
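// Only functions and variables in global-like address spaces may need a GOT
// access; symbols emitted as fixups or known to be DSO-local do not.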
6889 return (GV->getValueType()->isFunctionTy() || 6890 !isNonGlobalAddrSpace(GV->getAddressSpace())) && 6891 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV); 6892 } 6893 6894 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { 6895 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); 6896 } 6897 6898 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const { 6899 if (!GV->hasExternalLinkage()) 6900 return true; 6901 6902 const auto OS = getTargetMachine().getTargetTriple().getOS(); 6903 return OS == Triple::AMDHSA || OS == Triple::AMDPAL; 6904 } 6905 6906 /// This transforms the control flow intrinsics to get the branch destination as 6907 /// last parameter, also switches branch target with BR if the need arise 6908 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const { 6909 SDLoc DL(BRCOND); 6910 6911 SDNode *Intr = BRCOND.getOperand(1).getNode(); 6912 SDValue Target = BRCOND.getOperand(2); 6913 SDNode *BR = nullptr; 6914 SDNode *SetCC = nullptr; 6915 6916 if (Intr->getOpcode() == ISD::SETCC) { 6917 // As long as we negate the condition everything is fine 6918 SetCC = Intr; 6919 Intr = SetCC->getOperand(0).getNode(); 6920 6921 } else { 6922 // Get the target from BR if we don't negate the condition 6923 BR = findUser(BRCOND, ISD::BR); 6924 assert(BR && "brcond missing unconditional branch user"); 6925 Target = BR->getOperand(1); 6926 } 6927 6928 unsigned CFNode = isCFIntrinsic(Intr); 6929 if (CFNode == 0) { 6930 // This is a uniform branch so we don't need to legalize. 6931 return BRCOND; 6932 } 6933 6934 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID || 6935 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; 6936 6937 assert(!SetCC || 6938 (SetCC->getConstantOperandVal(1) == 1 && 6939 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 6940 ISD::SETNE)); 6941 6942 // operands of the new intrinsic call 6943 SmallVector<SDValue, 4> Ops; 6944 if (HaveChain) 6945 Ops.push_back(BRCOND.getOperand(0)); 6946 6947 Ops.append(Intr->op_begin() + (HaveChain ? 
2 : 1), Intr->op_end()); 6948 Ops.push_back(Target); 6949 6950 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 6951 6952 // build the new intrinsic call 6953 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode(); 6954 6955 if (!HaveChain) { 6956 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)}; 6957 6958 Result = DAG.getMergeValues(Ops, DL).getNode(); 6959 } 6960 6961 if (BR) { 6962 // Give the branch instruction our target 6963 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)}; 6964 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 6965 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 6966 } 6967 6968 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 6969 6970 // Copy the intrinsic results to registers 6971 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 6972 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 6973 if (!CopyToReg) 6974 continue; 6975 6976 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1), 6977 SDValue(Result, i - 1), SDValue()); 6978 6979 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 6980 } 6981 6982 // Remove the old intrinsic from the chain 6983 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1), 6984 Intr->getOperand(0)); 6985 6986 return Chain; 6987 } 6988 6989 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { 6990 MVT VT = Op.getSimpleValueType(); 6991 SDLoc DL(Op); 6992 // Checking the depth 6993 if (Op.getConstantOperandVal(0) != 0) 6994 return DAG.getConstant(0, DL, VT); 6995 6996 MachineFunction &MF = DAG.getMachineFunction(); 6997 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 6998 // Check for kernel and shader functions 6999 if (Info->isEntryFunction()) 7000 return DAG.getConstant(0, DL, VT); 7001 7002 MachineFrameInfo &MFI = MF.getFrameInfo(); 7003 // There is a call to @llvm.returnaddress in this function 7004 MFI.setReturnAddressIsTaken(true); 7005 7006 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 7007 // Get the return address reg and mark it as an implicit live-in 7008 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), 7009 getRegClassFor(VT, Op.getNode()->isDivergent())); 7010 7011 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 7012 } 7013 7014 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, 7015 const SDLoc &DL, EVT VT) const { 7016 return Op.getValueType().bitsLE(VT) 7017 ? 
DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) 7018 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op, 7019 DAG.getTargetConstant(0, DL, MVT::i32)); 7020 } 7021 7022 SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op, 7023 SelectionDAG &DAG) const { 7024 EVT DstVT = Op.getValueType(); 7025 unsigned NumElts = DstVT.getVectorNumElements(); 7026 assert(NumElts > 2 && isPowerOf2_32(NumElts)); 7027 7028 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); 7029 7030 SDLoc DL(Op); 7031 unsigned Opc = Op.getOpcode(); 7032 SDValue Flags = Op.getOperand(1); 7033 EVT HalfDstVT = 7034 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2); 7035 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags); 7036 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags); 7037 7038 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi); 7039 } 7040 7041 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 7042 SDValue Src = Op.getOperand(0); 7043 EVT SrcVT = Src.getValueType(); 7044 EVT DstVT = Op.getValueType(); 7045 7046 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) { 7047 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32"); 7048 if (SrcVT.getScalarType() != MVT::f32) 7049 return SDValue(); 7050 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG); 7051 } 7052 7053 if (SrcVT.getScalarType() != MVT::f64) 7054 return Op; 7055 7056 SDLoc DL(Op); 7057 if (DstVT == MVT::f16) { 7058 // TODO: Handle strictfp 7059 if (Op.getOpcode() != ISD::FP_ROUND) 7060 return Op; 7061 7062 if (!Subtarget->has16BitInsts()) { 7063 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); 7064 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); 7065 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); 7066 } 7067 if (getTargetMachine().Options.UnsafeFPMath) { 7068 SDValue Flags = Op.getOperand(1); 7069 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags); 7070 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags); 7071 } 7072 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG); 7073 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); 7074 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); 7075 } 7076 7077 assert(DstVT.getScalarType() == MVT::bf16 && 7078 "custom lower FP_ROUND for f16 or bf16"); 7079 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal"); 7080 7081 // Round-inexact-to-odd f64 to f32, then do the final rounding using the 7082 // hardware f32 -> bf16 instruction. 7083 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) : 7084 MVT::f32; 7085 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG); 7086 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod, 7087 DAG.getTargetConstant(0, DL, MVT::i32)); 7088 } 7089 7090 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, 7091 SelectionDAG &DAG) const { 7092 EVT VT = Op.getValueType(); 7093 const MachineFunction &MF = DAG.getMachineFunction(); 7094 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7095 bool IsIEEEMode = Info->getMode().IEEE; 7096 7097 // FIXME: Assert during selection that this is only selected for 7098 // ieee_mode. Currently a combine can produce the ieee version for non-ieee 7099 // mode functions, but this happens to be OK since it's only done in cases 7100 // where there is known no sNaN. 
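// With IEEE mode enabled, use the generic expansion so NaN handling follows
// the IEEE rules; otherwise the operation is kept as-is (only split when the
// vector type is too wide).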
7101 if (IsIEEEMode) 7102 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); 7103 7104 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || 7105 VT == MVT::v16bf16) 7106 return splitBinaryVectorOp(Op, DAG); 7107 return Op; 7108 } 7109 7110 SDValue 7111 SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op, 7112 SelectionDAG &DAG) const { 7113 EVT VT = Op.getValueType(); 7114 const MachineFunction &MF = DAG.getMachineFunction(); 7115 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7116 bool IsIEEEMode = Info->getMode().IEEE; 7117 7118 if (IsIEEEMode) 7119 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG); 7120 7121 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || 7122 VT == MVT::v16bf16) 7123 return splitBinaryVectorOp(Op, DAG); 7124 return Op; 7125 } 7126 7127 SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op, 7128 SelectionDAG &DAG) const { 7129 EVT VT = Op.getValueType(); 7130 if (VT.isVector()) 7131 return splitBinaryVectorOp(Op, DAG); 7132 7133 assert(!Subtarget->hasIEEEMinimumMaximumInsts() && 7134 !Subtarget->hasMinimum3Maximum3F16() && 7135 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 && 7136 "should not need to widen f16 minimum/maximum to v2f16"); 7137 7138 // Widen f16 operation to v2f16 7139 7140 // fminimum f16:x, f16:y -> 7141 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x)) 7142 // (v2f16 (scalar_to_vector y))), 0 7143 SDLoc SL(Op); 7144 SDValue WideSrc0 = 7145 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0)); 7146 SDValue WideSrc1 = 7147 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1)); 7148 7149 SDValue Widened = 7150 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1); 7151 7152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened, 7153 DAG.getConstant(0, SL, MVT::i32)); 7154 } 7155 7156 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { 7157 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; 7158 EVT VT = Op.getValueType(); 7159 assert(VT == MVT::f16); 7160 7161 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1); 7162 EVT ExpVT = Exp.getValueType(); 7163 if (ExpVT == MVT::i16) 7164 return Op; 7165 7166 SDLoc DL(Op); 7167 7168 // Correct the exponent type for f16 to i16. 7169 // Clamp the range of the exponent to the instruction's range. 7170 7171 // TODO: This should be a generic narrowing legalization, and can easily be 7172 // for GlobalISel. 
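// Clamp the 32-bit exponent into the signed 16-bit range before truncating
// so out-of-range exponents still saturate to the correct extreme.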
7173 7174 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT); 7175 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp); 7176 7177 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT); 7178 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp); 7179 7180 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); 7181 7182 if (IsStrict) { 7183 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other}, 7184 {Op.getOperand(0), Op.getOperand(1), TruncExp}); 7185 } 7186 7187 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp); 7188 } 7189 7190 static unsigned getExtOpcodeForPromotedOp(SDValue Op) { 7191 switch (Op->getOpcode()) { 7192 case ISD::SRA: 7193 case ISD::SMIN: 7194 case ISD::SMAX: 7195 return ISD::SIGN_EXTEND; 7196 case ISD::SRL: 7197 case ISD::UMIN: 7198 case ISD::UMAX: 7199 return ISD::ZERO_EXTEND; 7200 case ISD::ADD: 7201 case ISD::SUB: 7202 case ISD::AND: 7203 case ISD::OR: 7204 case ISD::XOR: 7205 case ISD::SHL: 7206 case ISD::SELECT: 7207 case ISD::MUL: 7208 // operation result won't be influenced by garbage high bits. 7209 // TODO: are all of those cases correct, and are there more? 7210 return ISD::ANY_EXTEND; 7211 case ISD::SETCC: { 7212 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7213 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 7214 } 7215 default: 7216 llvm_unreachable("unexpected opcode!"); 7217 } 7218 } 7219 7220 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, 7221 DAGCombinerInfo &DCI) const { 7222 const unsigned Opc = Op.getOpcode(); 7223 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL || 7224 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND || 7225 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL || 7226 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN || 7227 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX); 7228 7229 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType() 7230 : Op->getOperand(0).getValueType(); 7231 auto ExtTy = OpTy.changeElementType(MVT::i32); 7232 7233 if (DCI.isBeforeLegalizeOps() || 7234 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy)) 7235 return SDValue(); 7236 7237 auto &DAG = DCI.DAG; 7238 7239 SDLoc DL(Op); 7240 SDValue LHS; 7241 SDValue RHS; 7242 if (Opc == ISD::SELECT) { 7243 LHS = Op->getOperand(1); 7244 RHS = Op->getOperand(2); 7245 } else { 7246 LHS = Op->getOperand(0); 7247 RHS = Op->getOperand(1); 7248 } 7249 7250 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op); 7251 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS}); 7252 7253 // Special case: for shifts, the RHS always needs a zext. 7254 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) 7255 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS}); 7256 else 7257 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS}); 7258 7259 // setcc always return i1/i1 vec so no need to truncate after. 7260 if (Opc == ISD::SETCC) { 7261 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7262 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC); 7263 } 7264 7265 // For other ops, we extend the operation's return type as well so we need to 7266 // truncate back to the original type. 
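// For SELECT the original i1 condition is used unmodified; only the two
// value operands were widened.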
7267 SDValue NewVal;
7268 if (Opc == ISD::SELECT)
7269 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7270 else
7271 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7272
7273 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7274 }
7275
7276 SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7277 SDValue Mag = Op.getOperand(0);
7278 EVT MagVT = Mag.getValueType();
7279
7280 if (MagVT.getVectorNumElements() > 2)
7281 return splitBinaryVectorOp(Op, DAG);
7282
7283 SDValue Sign = Op.getOperand(1);
7284 EVT SignVT = Sign.getValueType();
7285
7286 if (MagVT == SignVT)
7287 return Op;
7288
7289 // fcopysign v2f16:mag, v2f32:sign ->
7290 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7291
7292 SDLoc SL(Op);
7293 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7294 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7295
7296 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7297
7298 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7299 }
7300
7301 // Custom lowering for vector multiplications and s_mul_u64.
7302 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7303 EVT VT = Op.getValueType();
7304
7305 // Split vector operands.
7306 if (VT.isVector())
7307 return splitBinaryVectorOp(Op, DAG);
7308
7309 assert(VT == MVT::i64 && "The following code is a special case for s_mul_u64");
7310
7311 // There are four ways to lower s_mul_u64:
7312 //
7313 // 1. If all the operands are uniform, then we lower it as it is.
7314 //
7315 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
7316 // multiplications because there is not a vector equivalent of s_mul_u64.
7317 //
7318 // 3. If the cost model decides that it is more efficient to use vector
7319 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
7320 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
7321 //
7322 // 4. If the cost model decides to use vector registers and both of the
7323 // operands are zero-extended/sign-extended from 32 bits, then we split the
7324 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7325 // possible to check if the operands are zero-extended or sign-extended in
7326 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7327 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7328 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7329 // If the cost model decides that we have to use vector registers, then
7330 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7331 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7332 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7333 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7334 // SIInstrInfo.cpp.
7335
7336 if (Op->isDivergent())
7337 return SDValue();
7338
7339 SDValue Op0 = Op.getOperand(0);
7340 SDValue Op1 = Op.getOperand(1);
7341 // If all the operands are zero-extended from 32 bits, then we replace
7342 // s_mul_u64 with s_mul_u64_u32_pseudo. If all the operands are sign-extended
7343 // from 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
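// A 64-bit operand that is really a zero-extended 32-bit value has at least
// 32 known leading zero bits; a sign-extended one has at least 33 sign bits.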
7344 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0); 7345 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros(); 7346 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1); 7347 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros(); 7348 SDLoc SL(Op); 7349 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32) 7350 return SDValue( 7351 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0); 7352 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0); 7353 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1); 7354 if (Op0SignBits >= 33 && Op1SignBits >= 33) 7355 return SDValue( 7356 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0); 7357 // If all the operands are uniform, then we lower s_mul_u64 as it is. 7358 return Op; 7359 } 7360 7361 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { 7362 EVT VT = Op.getValueType(); 7363 SDLoc SL(Op); 7364 SDValue LHS = Op.getOperand(0); 7365 SDValue RHS = Op.getOperand(1); 7366 bool isSigned = Op.getOpcode() == ISD::SMULO; 7367 7368 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) { 7369 const APInt &C = RHSC->getAPIntValue(); 7370 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X } 7371 if (C.isPowerOf2()) { 7372 // smulo(x, signed_min) is same as umulo(x, signed_min). 7373 bool UseArithShift = isSigned && !C.isMinSignedValue(); 7374 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32); 7375 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt); 7376 SDValue Overflow = 7377 DAG.getSetCC(SL, MVT::i1, 7378 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT, 7379 Result, ShiftAmt), 7380 LHS, ISD::SETNE); 7381 return DAG.getMergeValues({Result, Overflow}, SL); 7382 } 7383 } 7384 7385 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS); 7386 SDValue Top = 7387 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS); 7388 7389 SDValue Sign = isSigned 7390 ? DAG.getNode(ISD::SRA, SL, VT, Result, 7391 DAG.getConstant(VT.getScalarSizeInBits() - 1, 7392 SL, MVT::i32)) 7393 : DAG.getConstant(0, SL, VT); 7394 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE); 7395 7396 return DAG.getMergeValues({Result, Overflow}, SL); 7397 } 7398 7399 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { 7400 if (Op->isDivergent()) { 7401 // Select to V_MAD_[IU]64_[IU]32. 7402 return Op; 7403 } 7404 if (Subtarget->hasSMulHi()) { 7405 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32. 7406 return SDValue(); 7407 } 7408 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to 7409 // calculate the high part, so we might as well do the whole thing with 7410 // V_MAD_[IU]64_[IU]32. 7411 return Op; 7412 } 7413 7414 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { 7415 if (!Subtarget->isTrapHandlerEnabled() || 7416 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 7417 return lowerTrapEndpgm(Op, DAG); 7418 7419 return Subtarget->supportsGetDoorbellID() ? 
lowerTrapHsa(Op, DAG) 7420 : lowerTrapHsaQueuePtr(Op, DAG); 7421 } 7422 7423 SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const { 7424 SDLoc SL(Op); 7425 SDValue Chain = Op.getOperand(0); 7426 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain); 7427 } 7428 7429 SDValue 7430 SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, 7431 const SDLoc &DL, Align Alignment, 7432 ImplicitParameter Param) const { 7433 MachineFunction &MF = DAG.getMachineFunction(); 7434 uint64_t Offset = getImplicitParameterOffset(MF, Param); 7435 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); 7436 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 7437 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, 7438 MachineMemOperand::MODereferenceable | 7439 MachineMemOperand::MOInvariant); 7440 } 7441 7442 SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op, 7443 SelectionDAG &DAG) const { 7444 SDLoc SL(Op); 7445 SDValue Chain = Op.getOperand(0); 7446 7447 SDValue QueuePtr; 7448 // For code object version 5, QueuePtr is passed through implicit kernarg. 7449 const Module *M = DAG.getMachineFunction().getFunction().getParent(); 7450 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { 7451 QueuePtr = 7452 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); 7453 } else { 7454 MachineFunction &MF = DAG.getMachineFunction(); 7455 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7456 Register UserSGPR = Info->getQueuePtrUserSGPR(); 7457 7458 if (UserSGPR == AMDGPU::NoRegister) { 7459 // We probably are in a function incorrectly marked with 7460 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the 7461 // trap, so just use a null pointer. 7462 QueuePtr = DAG.getConstant(0, SL, MVT::i64); 7463 } else { 7464 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, 7465 MVT::i64); 7466 } 7467 } 7468 7469 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); 7470 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue()); 7471 7472 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); 7473 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01, 7474 ToReg.getValue(1)}; 7475 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 7476 } 7477 7478 SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const { 7479 SDLoc SL(Op); 7480 SDValue Chain = Op.getOperand(0); 7481 7482 // We need to simulate the 's_trap 2' instruction on targets that run in 7483 // PRIV=1 (where it is treated as a nop). 
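// See the AMDGPU::SIMULATED_TRAP case in EmitInstrWithCustomInserter()
// above, which expands the resulting pseudo via insertSimulatedTrap().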
7484 if (Subtarget->hasPrivEnabledTrap2NopBug()) 7485 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain); 7486 7487 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); 7488 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)}; 7489 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 7490 } 7491 7492 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { 7493 SDLoc SL(Op); 7494 SDValue Chain = Op.getOperand(0); 7495 MachineFunction &MF = DAG.getMachineFunction(); 7496 7497 if (!Subtarget->isTrapHandlerEnabled() || 7498 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 7499 LLVMContext &Ctx = MF.getFunction().getContext(); 7500 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(), 7501 "debugtrap handler not supported", 7502 Op.getDebugLoc(), DS_Warning)); 7503 return Chain; 7504 } 7505 7506 uint64_t TrapID = 7507 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); 7508 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)}; 7509 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 7510 } 7511 7512 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, 7513 SelectionDAG &DAG) const { 7514 if (Subtarget->hasApertureRegs()) { 7515 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 7516 ? AMDGPU::SRC_SHARED_BASE 7517 : AMDGPU::SRC_PRIVATE_BASE; 7518 // Note: this feature (register) is broken. When used as a 32-bit operand, 7519 // it returns a wrong value (all zeroes?). The real value is in the upper 32 7520 // bits. 7521 // 7522 // To work around the issue, directly emit a 64 bit mov from this register 7523 // then extract the high bits. Note that this shouldn't even result in a 7524 // shift being emitted and simply become a pair of registers (e.g.): 7525 // s_mov_b64 s[6:7], src_shared_base 7526 // v_mov_b32_e32 v1, s7 7527 // 7528 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy 7529 // coalescing would kick in and it would think it's okay to use the "HI" 7530 // subregister directly (instead of extracting the HI 32 bits) which is an 7531 // artificial (unusable) register. 7532 // Register TableGen definitions would need an overhaul to get rid of the 7533 // artificial "HI" aperture registers and prevent this kind of issue from 7534 // happening. 7535 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, 7536 DAG.getRegister(ApertureRegNo, MVT::i64)); 7537 return DAG.getNode( 7538 ISD::TRUNCATE, DL, MVT::i32, 7539 DAG.getNode(ISD::SRL, DL, MVT::i64, 7540 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)})); 7541 } 7542 7543 // For code object version 5, private_base and shared_base are passed through 7544 // implicit kernargs. 7545 const Module *M = DAG.getMachineFunction().getFunction().getParent(); 7546 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { 7547 ImplicitParameter Param = 7548 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; 7549 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); 7550 } 7551 7552 MachineFunction &MF = DAG.getMachineFunction(); 7553 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7554 Register UserSGPR = Info->getQueuePtrUserSGPR(); 7555 if (UserSGPR == AMDGPU::NoRegister) { 7556 // We probably are in a function incorrectly marked with 7557 // amdgpu-no-queue-ptr. This is undefined. 
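// Unlike lowerTrapHsaQueuePtr(), nothing here needs to be kept alive, so
// returning poison is acceptable.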
7558 return DAG.getPOISON(MVT::i32); 7559 } 7560 7561 SDValue QueuePtr = 7562 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 7563 7564 // Offset into amd_queue_t for group_segment_aperture_base_hi / 7565 // private_segment_aperture_base_hi. 7566 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 7567 7568 SDValue Ptr = 7569 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset)); 7570 7571 // TODO: Use custom target PseudoSourceValue. 7572 // TODO: We should use the value from the IR intrinsic call, but it might not 7573 // be available and how do we get it? 7574 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 7575 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, 7576 commonAlignment(Align(64), StructOffset), 7577 MachineMemOperand::MODereferenceable | 7578 MachineMemOperand::MOInvariant); 7579 } 7580 7581 /// Return true if the value is a known valid address, such that a null check is 7582 /// not necessary. 7583 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG, 7584 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 7585 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val)) 7586 return true; 7587 7588 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val)) 7589 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); 7590 7591 // TODO: Search through arithmetic, handle arguments and loads 7592 // marked nonnull. 7593 return false; 7594 } 7595 7596 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, 7597 SelectionDAG &DAG) const { 7598 SDLoc SL(Op); 7599 7600 const AMDGPUTargetMachine &TM = 7601 static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); 7602 7603 unsigned DestAS, SrcAS; 7604 SDValue Src; 7605 bool IsNonNull = false; 7606 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) { 7607 SrcAS = ASC->getSrcAddressSpace(); 7608 Src = ASC->getOperand(0); 7609 DestAS = ASC->getDestAddressSpace(); 7610 } else { 7611 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN && 7612 Op.getConstantOperandVal(0) == 7613 Intrinsic::amdgcn_addrspacecast_nonnull); 7614 Src = Op->getOperand(1); 7615 SrcAS = Op->getConstantOperandVal(2); 7616 DestAS = Op->getConstantOperandVal(3); 7617 IsNonNull = true; 7618 } 7619 7620 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); 7621 7622 // flat -> local/private 7623 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 7624 if (DestAS == AMDGPUAS::LOCAL_ADDRESS || 7625 DestAS == AMDGPUAS::PRIVATE_ADDRESS) { 7626 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 7627 7628 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) 7629 return Ptr; 7630 7631 unsigned NullVal = TM.getNullPointerValue(DestAS); 7632 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); 7633 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); 7634 7635 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr, 7636 SegmentNullPtr); 7637 } 7638 } 7639 7640 // local/private -> flat 7641 if (DestAS == AMDGPUAS::FLAT_ADDRESS) { 7642 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 7643 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { 7644 7645 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG); 7646 SDValue CvtPtr = 7647 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); 7648 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); 7649 7650 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) 7651 return CvtPtr; 7652 7653 unsigned NullVal = TM.getNullPointerValue(SrcAS); 7654 SDValue SegmentNullPtr = 
DAG.getConstant(NullVal, SL, MVT::i32); 7655 7656 SDValue NonNull = 7657 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); 7658 7659 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr, 7660 FlatNullPtr); 7661 } 7662 } 7663 7664 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 7665 Op.getValueType() == MVT::i64) { 7666 const SIMachineFunctionInfo *Info = 7667 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); 7668 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); 7669 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); 7670 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 7671 } 7672 7673 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 7674 Src.getValueType() == MVT::i64) 7675 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 7676 7677 // global <-> flat are no-ops and never emitted. 7678 7679 // Invalid casts are poison. 7680 return DAG.getPOISON(Op->getValueType(0)); 7681 } 7682 7683 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from 7684 // the small vector and inserting them into the big vector. That is better than 7685 // the default expansion of doing it via a stack slot. Even though the use of 7686 // the stack slot would be optimized away afterwards, the stack slot itself 7687 // remains. 7688 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, 7689 SelectionDAG &DAG) const { 7690 SDValue Vec = Op.getOperand(0); 7691 SDValue Ins = Op.getOperand(1); 7692 SDValue Idx = Op.getOperand(2); 7693 EVT VecVT = Vec.getValueType(); 7694 EVT InsVT = Ins.getValueType(); 7695 EVT EltVT = VecVT.getVectorElementType(); 7696 unsigned InsNumElts = InsVT.getVectorNumElements(); 7697 unsigned IdxVal = Idx->getAsZExtVal(); 7698 SDLoc SL(Op); 7699 7700 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) { 7701 // Insert 32-bit registers at a time. 7702 assert(InsNumElts % 2 == 0 && "expect legal vector types"); 7703 7704 unsigned VecNumElts = VecVT.getVectorNumElements(); 7705 EVT NewVecVT = 7706 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2); 7707 EVT NewInsVT = InsNumElts == 2 ? 
MVT::i32 7708 : EVT::getVectorVT(*DAG.getContext(), 7709 MVT::i32, InsNumElts / 2); 7710 7711 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec); 7712 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins); 7713 7714 for (unsigned I = 0; I != InsNumElts / 2; ++I) { 7715 SDValue Elt; 7716 if (InsNumElts == 2) { 7717 Elt = Ins; 7718 } else { 7719 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins, 7720 DAG.getConstant(I, SL, MVT::i32)); 7721 } 7722 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt, 7723 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32)); 7724 } 7725 7726 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec); 7727 } 7728 7729 for (unsigned I = 0; I != InsNumElts; ++I) { 7730 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins, 7731 DAG.getConstant(I, SL, MVT::i32)); 7732 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt, 7733 DAG.getConstant(IdxVal + I, SL, MVT::i32)); 7734 } 7735 return Vec; 7736 } 7737 7738 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, 7739 SelectionDAG &DAG) const { 7740 SDValue Vec = Op.getOperand(0); 7741 SDValue InsVal = Op.getOperand(1); 7742 SDValue Idx = Op.getOperand(2); 7743 EVT VecVT = Vec.getValueType(); 7744 EVT EltVT = VecVT.getVectorElementType(); 7745 unsigned VecSize = VecVT.getSizeInBits(); 7746 unsigned EltSize = EltVT.getSizeInBits(); 7747 SDLoc SL(Op); 7748 7749 // Specially handle the case of v4i16 with static indexing. 7750 unsigned NumElts = VecVT.getVectorNumElements(); 7751 auto *KIdx = dyn_cast<ConstantSDNode>(Idx); 7752 if (NumElts == 4 && EltSize == 16 && KIdx) { 7753 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); 7754 7755 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, 7756 DAG.getConstant(0, SL, MVT::i32)); 7757 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, 7758 DAG.getConstant(1, SL, MVT::i32)); 7759 7760 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf); 7761 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf); 7762 7763 unsigned Idx = KIdx->getZExtValue(); 7764 bool InsertLo = Idx < 2; 7765 SDValue InsHalf = DAG.getNode( 7766 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec, 7767 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), 7768 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); 7769 7770 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf); 7771 7772 SDValue Concat = 7773 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf}) 7774 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf}); 7775 7776 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); 7777 } 7778 7779 // Static indexing does not lower to stack access, and hence there is no need 7780 // for special custom lowering to avoid stack access. 7781 if (isa<ConstantSDNode>(Idx)) 7782 return SDValue(); 7783 7784 // Avoid stack access for dynamic indexing by custom lowering to 7785 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec 7786 7787 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits"); 7788 7789 MVT IntVT = MVT::getIntegerVT(VecSize); 7790 7791 // Convert vector index to bit-index and get the required bit mask. 
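// For example, with 16-bit elements the index is shifted left by 4, so the
// bit index is Idx * 16 and the mask is 0xffff << (Idx * 16).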
7792 assert(isPowerOf2_32(EltSize)); 7793 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize); 7794 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); 7795 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); 7796 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, 7797 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx); 7798 7799 // 1. Create a congruent vector with the target value in each element. 7800 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, 7801 DAG.getSplatBuildVector(VecVT, SL, InsVal)); 7802 7803 // 2. Mask off all other indices except the required index within (1). 7804 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); 7805 7806 // 3. Mask off the required index within the target vector. 7807 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); 7808 SDValue RHS = 7809 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec); 7810 7811 // 4. Get (2) and (3) ORed into the target vector. 7812 SDValue BFI = 7813 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint); 7814 7815 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); 7816 } 7817 7818 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, 7819 SelectionDAG &DAG) const { 7820 SDLoc SL(Op); 7821 7822 EVT ResultVT = Op.getValueType(); 7823 SDValue Vec = Op.getOperand(0); 7824 SDValue Idx = Op.getOperand(1); 7825 EVT VecVT = Vec.getValueType(); 7826 unsigned VecSize = VecVT.getSizeInBits(); 7827 EVT EltVT = VecVT.getVectorElementType(); 7828 7829 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); 7830 7831 // Make sure we do any optimizations that will make it easier to fold 7832 // source modifiers before obscuring it with bit operations. 7833 7834 // XXX - Why doesn't this get called when vector_shuffle is expanded? 
7835 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) 7836 return Combined; 7837 7838 if (VecSize == 128 || VecSize == 256 || VecSize == 512) { 7839 SDValue Lo, Hi; 7840 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT); 7841 7842 if (VecSize == 128) { 7843 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); 7844 Lo = DAG.getBitcast(LoVT, 7845 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7846 DAG.getConstant(0, SL, MVT::i32))); 7847 Hi = DAG.getBitcast(HiVT, 7848 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7849 DAG.getConstant(1, SL, MVT::i32))); 7850 } else if (VecSize == 256) { 7851 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); 7852 SDValue Parts[4]; 7853 for (unsigned P = 0; P < 4; ++P) { 7854 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7855 DAG.getConstant(P, SL, MVT::i32)); 7856 } 7857 7858 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, 7859 Parts[0], Parts[1])); 7860 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, 7861 Parts[2], Parts[3])); 7862 } else { 7863 assert(VecSize == 512); 7864 7865 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); 7866 SDValue Parts[8]; 7867 for (unsigned P = 0; P < 8; ++P) { 7868 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7869 DAG.getConstant(P, SL, MVT::i32)); 7870 } 7871 7872 Lo = DAG.getBitcast(LoVT, 7873 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, 7874 Parts[0], Parts[1], Parts[2], Parts[3])); 7875 Hi = DAG.getBitcast(HiVT, 7876 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, 7877 Parts[4], Parts[5], Parts[6], Parts[7])); 7878 } 7879 7880 EVT IdxVT = Idx.getValueType(); 7881 unsigned NElem = VecVT.getVectorNumElements(); 7882 assert(isPowerOf2_32(NElem)); 7883 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT); 7884 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask); 7885 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT); 7886 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx); 7887 } 7888 7889 assert(VecSize <= 64); 7890 7891 MVT IntVT = MVT::getIntegerVT(VecSize); 7892 7893 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly. 
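// The scalar operand already holds element 0 in its low bits (the remaining
// elements are undef), so it can stand in for the whole vector.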
7894   SDValue VecBC = peekThroughBitcasts(Vec);
7895   if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7896     SDValue Src = VecBC.getOperand(0);
7897     Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7898     Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7899   }
7900 
7901   unsigned EltSize = EltVT.getSizeInBits();
7902   assert(isPowerOf2_32(EltSize));
7903 
7904   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7905 
7906   // Convert vector index to bit-index (* EltSize)
7907   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7908 
7909   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7910   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7911 
7912   if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7913     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7914     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7915   }
7916 
7917   return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7918 }
7919 
7920 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7921   assert(Elt % 2 == 0);
7922   return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7923 }
7924 
7925 static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7926   assert(Elt % 2 == 0);
7927   return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7928          !(Mask[Elt + 1] & 1);
7929 }
7930 
7931 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7932                                               SelectionDAG &DAG) const {
7933   SDLoc SL(Op);
7934   EVT ResultVT = Op.getValueType();
7935   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7936   MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7937   const int NewSrcNumElts = 2;
7938   MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
7939   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7940 
7941   // Break up the shuffle into register-sized pieces.
7942   //
7943   // We're trying to form sub-shuffles that the register allocation pipeline
7944   // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
7945   // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
7946   // pair of copies into a consecutive register copy, so use the ordinary
7947   // extract_vector_elt lowering unless we can use the shuffle.
7948   //
7949   // TODO: This is a bit of a hack, and we should probably always use
7950   // extract_subvector for the largest possible subvector we can (or at least
7951   // use it for PackVT aligned pieces). However, we have worse support for
7952   // combines on them and don't directly treat extract_subvector /
7953   // insert_subvector as legal. The DAG scheduler also ends up doing a worse
7954   // job with the extract_subvectors.
7955   const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
7956 
7957   // vector_shuffle <0,1,6,7> lhs, rhs
7958   // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7959   //
7960   // vector_shuffle <6,7,2,3> lhs, rhs
7961   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7962   //
7963   // vector_shuffle <6,7,0,1> lhs, rhs
7964   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7965 
7966   // Avoid scalarizing when both halves are reading from consecutive elements.
7967 
7968   // If we're treating 2 element shuffles as legal, also create odd-to-even
7969   // shuffles of neighboring pairs.
7970 // 7971 // vector_shuffle <3,2,7,6> lhs, rhs 7972 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0) 7973 // vector_shuffle <1, 0> (extract_subvector rhs, 2) 7974 7975 SmallVector<SDValue, 16> Pieces; 7976 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { 7977 if (ShouldUseConsecutiveExtract && 7978 elementPairIsContiguous(SVN->getMask(), I)) { 7979 const int Idx = SVN->getMaskElt(I); 7980 int VecIdx = Idx < SrcNumElts ? 0 : 1; 7981 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; 7982 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, 7983 SVN->getOperand(VecIdx), 7984 DAG.getConstant(EltIdx, SL, MVT::i32)); 7985 Pieces.push_back(SubVec); 7986 } else if (elementPairIsOddToEven(SVN->getMask(), I) && 7987 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) { 7988 int Idx0 = SVN->getMaskElt(I); 7989 int Idx1 = SVN->getMaskElt(I + 1); 7990 7991 SDValue SrcOp0 = SVN->getOperand(0); 7992 SDValue SrcOp1 = SrcOp0; 7993 if (Idx0 >= SrcNumElts) { 7994 SrcOp0 = SVN->getOperand(1); 7995 Idx0 -= SrcNumElts; 7996 } 7997 7998 if (Idx1 >= SrcNumElts) { 7999 SrcOp1 = SVN->getOperand(1); 8000 Idx1 -= SrcNumElts; 8001 } 8002 8003 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1); 8004 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1); 8005 8006 // Extract nearest even aligned piece. 8007 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0, 8008 DAG.getConstant(AlignedIdx0, SL, MVT::i32)); 8009 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1, 8010 DAG.getConstant(AlignedIdx1, SL, MVT::i32)); 8011 8012 int NewMaskIdx0 = Idx0 - AlignedIdx0; 8013 int NewMaskIdx1 = Idx1 - AlignedIdx1; 8014 8015 SDValue Result0 = SubVec0; 8016 SDValue Result1 = SubVec0; 8017 8018 if (SubVec0 != SubVec1) { 8019 NewMaskIdx1 += NewSrcNumElts; 8020 Result1 = SubVec1; 8021 } else { 8022 Result1 = DAG.getPOISON(PackVT); 8023 } 8024 8025 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1, 8026 {NewMaskIdx0, NewMaskIdx1}); 8027 Pieces.push_back(Shuf); 8028 } else { 8029 const int Idx0 = SVN->getMaskElt(I); 8030 const int Idx1 = SVN->getMaskElt(I + 1); 8031 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1; 8032 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1; 8033 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; 8034 int EltIdx1 = Idx1 < SrcNumElts ? 
Idx1 : Idx1 - SrcNumElts; 8035 8036 SDValue Vec0 = SVN->getOperand(VecIdx0); 8037 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0, 8038 DAG.getSignedConstant(EltIdx0, SL, MVT::i32)); 8039 8040 SDValue Vec1 = SVN->getOperand(VecIdx1); 8041 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1, 8042 DAG.getSignedConstant(EltIdx1, SL, MVT::i32)); 8043 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1})); 8044 } 8045 } 8046 8047 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); 8048 } 8049 8050 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, 8051 SelectionDAG &DAG) const { 8052 SDValue SVal = Op.getOperand(0); 8053 EVT ResultVT = Op.getValueType(); 8054 EVT SValVT = SVal.getValueType(); 8055 SDValue UndefVal = DAG.getPOISON(SValVT); 8056 SDLoc SL(Op); 8057 8058 SmallVector<SDValue, 8> VElts; 8059 VElts.push_back(SVal); 8060 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) 8061 VElts.push_back(UndefVal); 8062 8063 return DAG.getBuildVector(ResultVT, SL, VElts); 8064 } 8065 8066 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, 8067 SelectionDAG &DAG) const { 8068 SDLoc SL(Op); 8069 EVT VT = Op.getValueType(); 8070 8071 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) { 8072 assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); 8073 8074 SDValue Lo = Op.getOperand(0); 8075 SDValue Hi = Op.getOperand(1); 8076 8077 // Avoid adding defined bits with the zero_extend. 8078 if (Hi.isUndef()) { 8079 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); 8080 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo); 8081 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo); 8082 } 8083 8084 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); 8085 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); 8086 8087 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, 8088 DAG.getConstant(16, SL, MVT::i32)); 8089 if (Lo.isUndef()) 8090 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi); 8091 8092 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); 8093 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); 8094 8095 SDValue Or = 8096 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint); 8097 return DAG.getNode(ISD::BITCAST, SL, VT, Or); 8098 } 8099 8100 // Split into 2-element chunks. 8101 const unsigned NumParts = VT.getVectorNumElements() / 2; 8102 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); 8103 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits()); 8104 8105 SmallVector<SDValue> Casts; 8106 for (unsigned P = 0; P < NumParts; ++P) { 8107 SDValue Vec = DAG.getBuildVector( 8108 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)}); 8109 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec)); 8110 } 8111 8112 SDValue Blend = 8113 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts); 8114 return DAG.getNode(ISD::BITCAST, SL, VT, Blend); 8115 } 8116 8117 bool SITargetLowering::isOffsetFoldingLegal( 8118 const GlobalAddressSDNode *GA) const { 8119 // OSes that use ELF REL relocations (instead of RELA) can only store a 8120 // 32-bit addend in the instruction, so it is not safe to allow offset folding 8121 // which can create arbitrary 64-bit addends. (This is only a problem for 8122 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by 8123 // the high 32 bits of the addend.) 
8124   //
8125   // This should be kept in sync with how HasRelocationAddend is initialized in
8126   // the constructor of ELFAMDGPUAsmBackend.
8127   if (!Subtarget->isAmdHsaOS())
8128     return false;
8129 
8130   // We can fold offsets for anything that doesn't require a GOT relocation.
8131   return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8132           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8133           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8134          !shouldEmitGOTReloc(GA->getGlobal());
8135 }
8136 
8137 static SDValue
8138 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8139                         const SDLoc &DL, int64_t Offset, EVT PtrVT,
8140                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
8141   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8142   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8143   // lowered to the following code sequence:
8144   //
8145   // For constant address space:
8146   //   s_getpc_b64 s[0:1]
8147   //   s_add_u32 s0, s0, $symbol
8148   //   s_addc_u32 s1, s1, 0
8149   //
8150   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
8151   //   a fixup or relocation is emitted to replace $symbol with a literal
8152   //   constant, which is a pc-relative offset from the encoding of the $symbol
8153   //   operand to the global variable.
8154   //
8155   // For global address space:
8156   //   s_getpc_b64 s[0:1]
8157   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8158   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8159   //
8160   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
8161   //   fixups or relocations are emitted to replace $symbol@*@lo and
8162   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8163   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
8164   //   operand to the global variable.
8165   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8166   SDValue PtrHi;
8167   if (GAFlags == SIInstrInfo::MO_NONE)
8168     PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8169   else
8170     PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8171   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8172 }
8173 
8174 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8175                                              SDValue Op,
8176                                              SelectionDAG &DAG) const {
8177   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8178   SDLoc DL(GSD);
8179   EVT PtrVT = Op.getValueType();
8180 
8181   const GlobalValue *GV = GSD->getGlobal();
8182   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8183        shouldUseLDSConstAddress(GV)) ||
8184       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8185       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8186     if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8187         GV->hasExternalLinkage()) {
8188       Type *Ty = GV->getValueType();
8189       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8190       // zero-sized type in other languages to declare dynamic shared memory
8191       // whose size is not known at compile time. Such arrays are allocated by
8192       // the runtime and placed directly after the statically allocated ones,
8193       // so they all share the same offset.
8194       if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8195         assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8196         // Adjust alignment for that dynamic shared memory array.
8197 Function &F = DAG.getMachineFunction().getFunction(); 8198 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); 8199 MFI->setUsesDynamicLDS(true); 8200 return SDValue( 8201 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); 8202 } 8203 } 8204 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 8205 } 8206 8207 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 8208 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), 8209 SIInstrInfo::MO_ABS32_LO); 8210 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); 8211 } 8212 8213 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { 8214 SDValue AddrLo = DAG.getTargetGlobalAddress( 8215 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); 8216 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; 8217 8218 SDValue AddrHi = DAG.getTargetGlobalAddress( 8219 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI); 8220 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0}; 8221 8222 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi); 8223 } 8224 8225 if (shouldEmitFixup(GV)) 8226 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); 8227 8228 if (shouldEmitPCReloc(GV)) 8229 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, 8230 SIInstrInfo::MO_REL32); 8231 8232 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, 8233 SIInstrInfo::MO_GOTPCREL32); 8234 PointerType *PtrTy = 8235 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS); 8236 const DataLayout &DataLayout = DAG.getDataLayout(); 8237 Align Alignment = DataLayout.getABITypeAlign(PtrTy); 8238 MachinePointerInfo PtrInfo = 8239 MachinePointerInfo::getGOT(DAG.getMachineFunction()); 8240 8241 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment, 8242 MachineMemOperand::MODereferenceable | 8243 MachineMemOperand::MOInvariant); 8244 } 8245 8246 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, 8247 const SDLoc &DL, SDValue V) const { 8248 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as 8249 // the destination register. 8250 // 8251 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 8252 // so we will end up with redundant moves to m0. 8253 // 8254 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 8255 8256 // A Null SDValue creates a glue result. 8257 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 8258 V, Chain); 8259 return SDValue(M0, 0); 8260 } 8261 8262 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, 8263 MVT VT, 8264 unsigned Offset) const { 8265 SDLoc SL(Op); 8266 SDValue Param = lowerKernargMemParameter( 8267 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false); 8268 // The local size values will have the hi 16-bits as zero. 
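// AssertZext records that the value fits in VT, letting later combines drop
// redundant zero-extensions and masks.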
8269 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 8270 DAG.getValueType(VT)); 8271 } 8272 8273 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 8274 EVT VT) { 8275 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 8276 DAG.getMachineFunction().getFunction(), 8277 "non-hsa intrinsic with hsa target", DL.getDebugLoc())); 8278 return DAG.getPOISON(VT); 8279 } 8280 8281 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 8282 EVT VT) { 8283 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 8284 DAG.getMachineFunction().getFunction(), 8285 "intrinsic not supported on subtarget", DL.getDebugLoc())); 8286 return DAG.getPOISON(VT); 8287 } 8288 8289 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, 8290 ArrayRef<SDValue> Elts) { 8291 assert(!Elts.empty()); 8292 MVT Type; 8293 unsigned NumElts = Elts.size(); 8294 8295 if (NumElts <= 12) { 8296 Type = MVT::getVectorVT(MVT::f32, NumElts); 8297 } else { 8298 assert(Elts.size() <= 16); 8299 Type = MVT::v16f32; 8300 NumElts = 16; 8301 } 8302 8303 SmallVector<SDValue, 16> VecElts(NumElts); 8304 for (unsigned i = 0; i < Elts.size(); ++i) { 8305 SDValue Elt = Elts[i]; 8306 if (Elt.getValueType() != MVT::f32) 8307 Elt = DAG.getBitcast(MVT::f32, Elt); 8308 VecElts[i] = Elt; 8309 } 8310 for (unsigned i = Elts.size(); i < NumElts; ++i) 8311 VecElts[i] = DAG.getPOISON(MVT::f32); 8312 8313 if (NumElts == 1) 8314 return VecElts[0]; 8315 return DAG.getBuildVector(Type, DL, VecElts); 8316 } 8317 8318 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, 8319 SDValue Src, int ExtraElts) { 8320 EVT SrcVT = Src.getValueType(); 8321 8322 SmallVector<SDValue, 8> Elts; 8323 8324 if (SrcVT.isVector()) 8325 DAG.ExtractVectorElements(Src, Elts); 8326 else 8327 Elts.push_back(Src); 8328 8329 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType()); 8330 while (ExtraElts--) 8331 Elts.push_back(Undef); 8332 8333 return DAG.getBuildVector(CastVT, DL, Elts); 8334 } 8335 8336 // Re-construct the required return value for a image load intrinsic. 8337 // This is more complicated due to the optional use TexFailCtrl which means the 8338 // required return type is an aggregate 8339 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, 8340 ArrayRef<EVT> ResultTypes, bool IsTexFail, 8341 bool Unpacked, bool IsD16, int DMaskPop, 8342 int NumVDataDwords, bool IsAtomicPacked16Bit, 8343 const SDLoc &DL) { 8344 // Determine the required return type. This is the same regardless of 8345 // IsTexFail flag 8346 EVT ReqRetVT = ResultTypes[0]; 8347 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; 8348 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit) 8349 ? (ReqRetNumElts + 1) / 2 8350 : ReqRetNumElts; 8351 8352 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2; 8353 8354 MVT DataDwordVT = 8355 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords); 8356 8357 MVT MaskPopVT = 8358 MaskPopDwords == 1 ? 
MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords); 8359 8360 SDValue Data(Result, 0); 8361 SDValue TexFail; 8362 8363 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) { 8364 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); 8365 if (MaskPopVT.isVector()) { 8366 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, 8367 SDValue(Result, 0), ZeroIdx); 8368 } else { 8369 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, 8370 SDValue(Result, 0), ZeroIdx); 8371 } 8372 } 8373 8374 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit) 8375 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, 8376 NumDataDwords - MaskPopDwords); 8377 8378 if (IsD16) 8379 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); 8380 8381 EVT LegalReqRetVT = ReqRetVT; 8382 if (!ReqRetVT.isVector()) { 8383 if (!Data.getValueType().isInteger()) 8384 Data = DAG.getNode(ISD::BITCAST, DL, 8385 Data.getValueType().changeTypeToInteger(), Data); 8386 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); 8387 } else { 8388 // We need to widen the return vector to a legal type 8389 if ((ReqRetVT.getVectorNumElements() % 2) == 1 && 8390 ReqRetVT.getVectorElementType().getSizeInBits() == 16) { 8391 LegalReqRetVT = 8392 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(), 8393 ReqRetVT.getVectorNumElements() + 1); 8394 } 8395 } 8396 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data); 8397 8398 if (IsTexFail) { 8399 TexFail = 8400 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0), 8401 DAG.getConstant(MaskPopDwords, DL, MVT::i32)); 8402 8403 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); 8404 } 8405 8406 if (Result->getNumValues() == 1) 8407 return Data; 8408 8409 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL); 8410 } 8411 8412 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, 8413 SDValue *LWE, bool &IsTexFail) { 8414 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode()); 8415 8416 uint64_t Value = TexFailCtrlConst->getZExtValue(); 8417 if (Value) { 8418 IsTexFail = true; 8419 } 8420 8421 SDLoc DL(TexFailCtrlConst); 8422 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); 8423 Value &= ~(uint64_t)0x1; 8424 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); 8425 Value &= ~(uint64_t)0x2; 8426 8427 return Value == 0; 8428 } 8429 8430 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, 8431 MVT PackVectorVT, 8432 SmallVectorImpl<SDValue> &PackedAddrs, 8433 unsigned DimIdx, unsigned EndIdx, 8434 unsigned NumGradients) { 8435 SDLoc DL(Op); 8436 for (unsigned I = DimIdx; I < EndIdx; I++) { 8437 SDValue Addr = Op.getOperand(I); 8438 8439 // Gradients are packed with undef for each coordinate. 
8440 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this: 8441 // 1D: undef,dx/dh; undef,dx/dv 8442 // 2D: dy/dh,dx/dh; dy/dv,dx/dv 8443 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv 8444 if (((I + 1) >= EndIdx) || 8445 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || 8446 I == DimIdx + NumGradients - 1))) { 8447 if (Addr.getValueType() != MVT::i16) 8448 Addr = DAG.getBitcast(MVT::i16, Addr); 8449 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr); 8450 } else { 8451 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)}); 8452 I++; 8453 } 8454 Addr = DAG.getBitcast(MVT::f32, Addr); 8455 PackedAddrs.push_back(Addr); 8456 } 8457 } 8458 8459 SDValue SITargetLowering::lowerImage(SDValue Op, 8460 const AMDGPU::ImageDimIntrinsicInfo *Intr, 8461 SelectionDAG &DAG, bool WithChain) const { 8462 SDLoc DL(Op); 8463 MachineFunction &MF = DAG.getMachineFunction(); 8464 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); 8465 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 8466 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 8467 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 8468 unsigned IntrOpcode = Intr->BaseOpcode; 8469 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); 8470 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); 8471 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 8472 8473 SmallVector<EVT, 3> ResultTypes(Op->values()); 8474 SmallVector<EVT, 3> OrigResultTypes(Op->values()); 8475 bool IsD16 = false; 8476 bool IsG16 = false; 8477 bool IsA16 = false; 8478 SDValue VData; 8479 int NumVDataDwords = 0; 8480 bool AdjustRetType = false; 8481 bool IsAtomicPacked16Bit = false; 8482 8483 // Offset of intrinsic arguments 8484 const unsigned ArgOffset = WithChain ? 2 : 1; 8485 8486 unsigned DMask; 8487 unsigned DMaskLanes = 0; 8488 8489 if (BaseOpcode->Atomic) { 8490 VData = Op.getOperand(2); 8491 8492 IsAtomicPacked16Bit = 8493 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 8494 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 8495 8496 bool Is64Bit = VData.getValueSizeInBits() == 64; 8497 if (BaseOpcode->AtomicX2) { 8498 SDValue VData2 = Op.getOperand(3); 8499 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, 8500 {VData, VData2}); 8501 if (Is64Bit) 8502 VData = DAG.getBitcast(MVT::v4i32, VData); 8503 8504 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; 8505 DMask = Is64Bit ? 0xf : 0x3; 8506 NumVDataDwords = Is64Bit ? 4 : 2; 8507 } else { 8508 DMask = Is64Bit ? 0x3 : 0x1; 8509 NumVDataDwords = Is64Bit ? 2 : 1; 8510 } 8511 } else { 8512 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex); 8513 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); 8514 8515 if (BaseOpcode->Store) { 8516 VData = Op.getOperand(2); 8517 8518 MVT StoreVT = VData.getSimpleValueType(); 8519 if (StoreVT.getScalarType() == MVT::f16) { 8520 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) 8521 return Op; // D16 is unsupported for this instruction 8522 8523 IsD16 = true; 8524 VData = handleD16VData(VData, DAG, true); 8525 } 8526 8527 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; 8528 } else if (!BaseOpcode->NoReturn) { 8529 // Work out the num dwords based on the dmask popcount and underlying type 8530 // and whether packing is supported. 
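// For example, a d16 load with dmask 0xf needs only two dwords when the
// target packs D16 results, but four dwords with unpacked D16 memory.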
8531 MVT LoadVT = ResultTypes[0].getSimpleVT(); 8532 if (LoadVT.getScalarType() == MVT::f16) { 8533 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) 8534 return Op; // D16 is unsupported for this instruction 8535 8536 IsD16 = true; 8537 } 8538 8539 // Confirm that the return type is large enough for the dmask specified 8540 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) || 8541 (!LoadVT.isVector() && DMaskLanes > 1)) 8542 return Op; 8543 8544 // The sq block of gfx8 and gfx9 do not estimate register use correctly 8545 // for d16 image_gather4, image_gather4_l, and image_gather4_lz 8546 // instructions. 8547 if (IsD16 && !Subtarget->hasUnpackedD16VMem() && 8548 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug())) 8549 NumVDataDwords = (DMaskLanes + 1) / 2; 8550 else 8551 NumVDataDwords = DMaskLanes; 8552 8553 AdjustRetType = true; 8554 } 8555 } 8556 8557 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; 8558 SmallVector<SDValue, 4> VAddrs; 8559 8560 // Check for 16 bit addresses or derivatives and pack if true. 8561 MVT VAddrVT = 8562 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); 8563 MVT VAddrScalarVT = VAddrVT.getScalarType(); 8564 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; 8565 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; 8566 8567 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); 8568 VAddrScalarVT = VAddrVT.getScalarType(); 8569 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; 8570 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; 8571 8572 // Push back extra arguments. 8573 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { 8574 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { 8575 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 8576 // Special handling of bias when A16 is on. Bias is of type half but 8577 // occupies full 32-bit. 
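// Pad it to a v2f16 with a poison high half so it still fills one dword.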
8578       SDValue Bias = DAG.getBuildVector(
8579           MVT::v2f16, DL,
8580           {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
8581       VAddrs.push_back(Bias);
8582     } else {
8583       assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8584              "Bias needs to be converted to 16 bit in A16 mode");
8585       VAddrs.push_back(Op.getOperand(ArgOffset + I));
8586     }
8587   }
8588 
8589   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8590     // 16 bit gradients are supported, but they are tied to the A16 control,
8591     // so both gradients and addresses must be 16 bit.
8592     LLVM_DEBUG(
8593         dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8594                   "require 16 bit args for both gradients and addresses\n");
8595     return Op;
8596   }
8597 
8598   if (IsA16) {
8599     if (!ST->hasA16()) {
8600       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8601                            "support 16 bit addresses\n");
8602       return Op;
8603     }
8604   }
8605 
8606   // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
8607   // set then we have to compress/pack the operands (either the addresses,
8608   // the gradients, or both).
8609   // In the case where a16 and gradients are tied (no G16 support), we have
8610   // already verified that both IsA16 and IsG16 are true.
8611   if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8612     // Activate g16
8613     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8614         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8615     IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8616   }
8617 
8618   // Add gradients (packed or unpacked)
8619   if (IsG16) {
8620     // Pack the gradients
8621     // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8622     packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8623                               ArgOffset + Intr->GradientStart,
8624                               ArgOffset + Intr->CoordStart, Intr->NumGradients);
8625   } else {
8626     for (unsigned I = ArgOffset + Intr->GradientStart;
8627          I < ArgOffset + Intr->CoordStart; I++)
8628       VAddrs.push_back(Op.getOperand(I));
8629   }
8630 
8631   // Add addresses (packed or unpacked)
8632   if (IsA16) {
8633     packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8634                               ArgOffset + Intr->CoordStart, VAddrEnd,
8635                               0 /* No gradients */);
8636   } else {
8637     // Add uncompressed address
8638     for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8639       VAddrs.push_back(Op.getOperand(I));
8640   }
8641 
8642   // If the register allocator cannot place the address registers contiguously
8643   // without introducing moves, then using the non-sequential address encoding
8644   // is always preferable, since it saves VALU instructions and is usually a
8645   // wash in terms of code size or even better.
8646   //
8647   // However, we currently have no way of hinting to the register allocator that
8648   // MIMG addresses should be placed contiguously when it is possible to do so,
8649   // so force non-NSA for the common 2-address case as a heuristic.
8650   //
8651   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8652   // allocation when possible.
8653   //
8654   // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8655   // set of the remaining addresses.
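// In that case the first NSAMaxSize - 1 addresses stay in separate registers
// and everything after them is packed below into one contiguous vector
// register.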
8656 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler); 8657 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding(); 8658 const bool UseNSA = ST->hasNSAEncoding() && 8659 VAddrs.size() >= ST->getNSAThreshold(MF) && 8660 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding); 8661 const bool UsePartialNSA = 8662 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize; 8663 8664 SDValue VAddr; 8665 if (UsePartialNSA) { 8666 VAddr = getBuildDwordsVector(DAG, DL, 8667 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1)); 8668 } else if (!UseNSA) { 8669 VAddr = getBuildDwordsVector(DAG, DL, VAddrs); 8670 } 8671 8672 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); 8673 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); 8674 SDValue Unorm; 8675 if (!BaseOpcode->Sampler) { 8676 Unorm = True; 8677 } else { 8678 uint64_t UnormConst = 8679 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex); 8680 8681 Unorm = UnormConst ? True : False; 8682 } 8683 8684 SDValue TFE; 8685 SDValue LWE; 8686 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex); 8687 bool IsTexFail = false; 8688 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail)) 8689 return Op; 8690 8691 if (IsTexFail) { 8692 if (!DMaskLanes) { 8693 // Expecting to get an error flag since TFC is on - and dmask is 0 8694 // Force dmask to be at least 1 otherwise the instruction will fail 8695 DMask = 0x1; 8696 DMaskLanes = 1; 8697 NumVDataDwords = 1; 8698 } 8699 NumVDataDwords += 1; 8700 AdjustRetType = true; 8701 } 8702 8703 // Has something earlier tagged that the return type needs adjusting 8704 // This happens if the instruction is a load or has set TexFailCtrl flags 8705 if (AdjustRetType) { 8706 // NumVDataDwords reflects the true number of dwords required in the return 8707 // type 8708 if (DMaskLanes == 0 && !BaseOpcode->Store) { 8709 // This is a no-op load. This can be eliminated 8710 SDValue Undef = DAG.getPOISON(Op.getValueType()); 8711 if (isa<MemSDNode>(Op)) 8712 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); 8713 return Undef; 8714 } 8715 8716 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(), 8717 MVT::i32, NumVDataDwords) 8718 : MVT::i32; 8719 8720 ResultTypes[0] = NewVT; 8721 if (ResultTypes.size() == 3) { 8722 // Original result was aggregate type used for TexFailCtrl results 8723 // The actual instruction returns as a vector type which has now been 8724 // created. Remove the aggregate result. 8725 ResultTypes.erase(&ResultTypes[1]); 8726 } 8727 } 8728 8729 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); 8730 if (BaseOpcode->Atomic) 8731 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization 8732 if (CPol & ~((IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | 8733 AMDGPU::CPol::VOLATILE)) 8734 return Op; 8735 8736 SmallVector<SDValue, 26> Ops; 8737 if (BaseOpcode->Store || BaseOpcode->Atomic) 8738 Ops.push_back(VData); // vdata 8739 if (UsePartialNSA) { 8740 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1)); 8741 Ops.push_back(VAddr); 8742 } else if (UseNSA) 8743 append_range(Ops, VAddrs); 8744 else 8745 Ops.push_back(VAddr); 8746 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex); 8747 EVT RsrcVT = Rsrc.getValueType(); 8748 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32) 8749 return Op; 8750 Ops.push_back(Rsrc); 8751 if (BaseOpcode->Sampler) { 8752 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex); 8753 if (Samp.getValueType() != MVT::v4i32) 8754 return Op; 8755 Ops.push_back(Samp); 8756 } 8757 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); 8758 if (IsGFX10Plus) 8759 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); 8760 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) 8761 Ops.push_back(Unorm); 8762 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32)); 8763 Ops.push_back(IsA16 && // r128, a16 for gfx9 8764 ST->hasFeature(AMDGPU::FeatureR128A16) 8765 ? True 8766 : False); 8767 if (IsGFX10Plus) 8768 Ops.push_back(IsA16 ? True : False); 8769 8770 if (!Subtarget->hasGFX90AInsts()) 8771 Ops.push_back(TFE); // tfe 8772 else if (TFE->getAsZExtVal()) { 8773 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 8774 DAG.getMachineFunction().getFunction(), 8775 "TFE is not supported on this GPU", DL.getDebugLoc())); 8776 } 8777 8778 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) 8779 Ops.push_back(LWE); // lwe 8780 if (!IsGFX10Plus) 8781 Ops.push_back(DimInfo->DA ? True : False); 8782 if (BaseOpcode->HasD16) 8783 Ops.push_back(IsD16 ? True : False); 8784 if (isa<MemSDNode>(Op)) 8785 Ops.push_back(Op.getOperand(0)); // chain 8786 8787 int NumVAddrDwords = 8788 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; 8789 int Opcode = -1; 8790 8791 if (IsGFX12Plus) { 8792 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, 8793 NumVDataDwords, NumVAddrDwords); 8794 } else if (IsGFX11Plus) { 8795 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 8796 UseNSA ? AMDGPU::MIMGEncGfx11NSA 8797 : AMDGPU::MIMGEncGfx11Default, 8798 NumVDataDwords, NumVAddrDwords); 8799 } else if (IsGFX10Plus) { 8800 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 8801 UseNSA ? 
AMDGPU::MIMGEncGfx10NSA 8802 : AMDGPU::MIMGEncGfx10Default, 8803 NumVDataDwords, NumVAddrDwords); 8804 } else { 8805 if (Subtarget->hasGFX90AInsts()) { 8806 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, 8807 NumVDataDwords, NumVAddrDwords); 8808 if (Opcode == -1) { 8809 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 8810 DAG.getMachineFunction().getFunction(), 8811 "requested image instruction is not supported on this GPU", 8812 DL.getDebugLoc())); 8813 8814 unsigned Idx = 0; 8815 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size()); 8816 for (EVT VT : OrigResultTypes) { 8817 if (VT == MVT::Other) 8818 RetValues[Idx++] = Op.getOperand(0); // Chain 8819 else 8820 RetValues[Idx++] = DAG.getPOISON(VT); 8821 } 8822 8823 return DAG.getMergeValues(RetValues, DL); 8824 } 8825 } 8826 if (Opcode == -1 && 8827 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 8828 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 8829 NumVDataDwords, NumVAddrDwords); 8830 if (Opcode == -1) 8831 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 8832 NumVDataDwords, NumVAddrDwords); 8833 } 8834 if (Opcode == -1) 8835 return Op; 8836 8837 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); 8838 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) { 8839 MachineMemOperand *MemRef = MemOp->getMemOperand(); 8840 DAG.setNodeMemRefs(NewNode, {MemRef}); 8841 } 8842 8843 if (BaseOpcode->AtomicX2) { 8844 SmallVector<SDValue, 1> Elt; 8845 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); 8846 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); 8847 } 8848 if (BaseOpcode->NoReturn) 8849 return SDValue(NewNode, 0); 8850 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, 8851 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, 8852 NumVDataDwords, IsAtomicPacked16Bit, DL); 8853 } 8854 8855 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, 8856 SDValue Offset, SDValue CachePolicy, 8857 SelectionDAG &DAG) const { 8858 MachineFunction &MF = DAG.getMachineFunction(); 8859 8860 const DataLayout &DataLayout = DAG.getDataLayout(); 8861 Align Alignment = 8862 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); 8863 8864 MachineMemOperand *MMO = MF.getMachineMemOperand( 8865 MachinePointerInfo(), 8866 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 8867 MachineMemOperand::MOInvariant, 8868 VT.getStoreSize(), Alignment); 8869 8870 if (!Offset->isDivergent()) { 8871 SDValue Ops[] = {Rsrc, Offset, CachePolicy}; 8872 8873 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the 8874 // s_buffer_load_u16 instruction is emitted for both signed and unsigned 8875 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext 8876 // and generates s_buffer_load_i16 (performSignExtendInRegCombine). 8877 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { 8878 SDValue BufferLoad = 8879 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL, 8880 DAG.getVTList(MVT::i32), Ops, VT, MMO); 8881 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 8882 } 8883 8884 // Widen vec3 load to vec4. 
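// Targets without scalar dwordx3 loads have no 3-dword s_buffer_load, so load
// four dwords and extract the low three-element subvector.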
8885 if (VT.isVector() && VT.getVectorNumElements() == 3 && 8886 !Subtarget->hasScalarDwordx3Loads()) { 8887 EVT WidenedVT = 8888 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 8889 auto WidenedOp = DAG.getMemIntrinsicNode( 8890 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT, 8891 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize())); 8892 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, 8893 DAG.getVectorIdxConstant(0, DL)); 8894 return Subvector; 8895 } 8896 8897 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, 8898 DAG.getVTList(VT), Ops, VT, MMO); 8899 } 8900 8901 // We have a divergent offset. Emit a MUBUF buffer load instead. We can 8902 // assume that the buffer is unswizzled. 8903 SDValue Ops[] = { 8904 DAG.getEntryNode(), // Chain 8905 Rsrc, // rsrc 8906 DAG.getConstant(0, DL, MVT::i32), // vindex 8907 {}, // voffset 8908 {}, // soffset 8909 {}, // offset 8910 CachePolicy, // cachepolicy 8911 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 8912 }; 8913 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { 8914 setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); 8915 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); 8916 } 8917 8918 SmallVector<SDValue, 4> Loads; 8919 unsigned NumLoads = 1; 8920 MVT LoadVT = VT.getSimpleVT(); 8921 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; 8922 assert((LoadVT.getScalarType() == MVT::i32 || 8923 LoadVT.getScalarType() == MVT::f32)); 8924 8925 if (NumElts == 8 || NumElts == 16) { 8926 NumLoads = NumElts / 4; 8927 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4); 8928 } 8929 8930 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other}); 8931 8932 // Use the alignment to ensure that the required offsets will fit into the 8933 // immediate offsets. 8934 setBufferOffsets(Offset, DAG, &Ops[3], 8935 NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); 8936 8937 uint64_t InstOffset = Ops[5]->getAsZExtVal(); 8938 for (unsigned i = 0; i < NumLoads; ++i) { 8939 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); 8940 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, 8941 LoadVT, MMO, DAG)); 8942 } 8943 8944 if (NumElts == 8 || NumElts == 16) 8945 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); 8946 8947 return Loads[0]; 8948 } 8949 8950 SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { 8951 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 8952 if (!Subtarget->hasArchitectedSGPRs()) 8953 return {}; 8954 SDLoc SL(Op); 8955 MVT VT = MVT::i32; 8956 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT); 8957 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8, 8958 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT)); 8959 } 8960 8961 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, 8962 unsigned Dim, 8963 const ArgDescriptor &Arg) const { 8964 SDLoc SL(Op); 8965 MachineFunction &MF = DAG.getMachineFunction(); 8966 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim); 8967 if (MaxID == 0) 8968 return DAG.getConstant(0, SL, MVT::i32); 8969 8970 // It's undefined behavior if a function marked with the amdgpu-no-* 8971 // attributes uses the corresponding intrinsic. 
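// A missing argument descriptor therefore lowers to poison instead of reading
// an input register.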
8972 if (!Arg) 8973 return DAG.getPOISON(Op->getValueType(0)); 8974 8975 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, 8976 SDLoc(DAG.getEntryNode()), Arg); 8977 8978 // Don't bother inserting AssertZext for packed IDs since we're emitting the 8979 // masking operations anyway. 8980 // 8981 // TODO: We could assert the top bit is 0 for the source copy. 8982 if (Arg.isMasked()) 8983 return Val; 8984 8985 // Preserve the known bits after expansion to a copy. 8986 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID)); 8987 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val, 8988 DAG.getValueType(SmallVT)); 8989 } 8990 8991 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 8992 SelectionDAG &DAG) const { 8993 MachineFunction &MF = DAG.getMachineFunction(); 8994 auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); 8995 8996 EVT VT = Op.getValueType(); 8997 SDLoc DL(Op); 8998 unsigned IntrinsicID = Op.getConstantOperandVal(0); 8999 9000 // TODO: Should this propagate fast-math-flags? 9001 9002 switch (IntrinsicID) { 9003 case Intrinsic::amdgcn_implicit_buffer_ptr: { 9004 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction())) 9005 return emitNonHSAIntrinsicError(DAG, DL, VT); 9006 return getPreloadedValue(DAG, *MFI, VT, 9007 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 9008 } 9009 case Intrinsic::amdgcn_dispatch_ptr: 9010 case Intrinsic::amdgcn_queue_ptr: { 9011 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) { 9012 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 9013 MF.getFunction(), "unsupported hsa intrinsic without hsa target", 9014 DL.getDebugLoc())); 9015 return DAG.getPOISON(VT); 9016 } 9017 9018 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr 9019 ? AMDGPUFunctionArgInfo::DISPATCH_PTR 9020 : AMDGPUFunctionArgInfo::QUEUE_PTR; 9021 return getPreloadedValue(DAG, *MFI, VT, RegID); 9022 } 9023 case Intrinsic::amdgcn_implicitarg_ptr: { 9024 if (MFI->isEntryFunction()) 9025 return getImplicitArgPtr(DAG, DL); 9026 return getPreloadedValue(DAG, *MFI, VT, 9027 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 9028 } 9029 case Intrinsic::amdgcn_kernarg_segment_ptr: { 9030 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { 9031 // This only makes sense to call in a kernel, so just lower to null. 
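// Non-kernel calling conventions have no kernarg segment of their own, so
// there is nothing meaningful to point at.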
9032 return DAG.getConstant(0, DL, VT); 9033 } 9034 9035 return getPreloadedValue(DAG, *MFI, VT, 9036 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 9037 } 9038 case Intrinsic::amdgcn_dispatch_id: { 9039 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID); 9040 } 9041 case Intrinsic::amdgcn_rcp: 9042 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 9043 case Intrinsic::amdgcn_rsq: 9044 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 9045 case Intrinsic::amdgcn_rsq_legacy: 9046 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 9047 return emitRemovedIntrinsicError(DAG, DL, VT); 9048 return SDValue(); 9049 case Intrinsic::amdgcn_rcp_legacy: 9050 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 9051 return emitRemovedIntrinsicError(DAG, DL, VT); 9052 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); 9053 case Intrinsic::amdgcn_rsq_clamp: { 9054 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 9055 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 9056 9057 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 9058 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 9059 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 9060 9061 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 9062 SDValue Tmp = 9063 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT)); 9064 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 9065 DAG.getConstantFP(Min, DL, VT)); 9066 } 9067 case Intrinsic::r600_read_ngroups_x: 9068 if (Subtarget->isAmdHsaOS()) 9069 return emitNonHSAIntrinsicError(DAG, DL, VT); 9070 9071 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 9072 SI::KernelInputOffsets::NGROUPS_X, Align(4), 9073 false); 9074 case Intrinsic::r600_read_ngroups_y: 9075 if (Subtarget->isAmdHsaOS()) 9076 return emitNonHSAIntrinsicError(DAG, DL, VT); 9077 9078 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 9079 SI::KernelInputOffsets::NGROUPS_Y, Align(4), 9080 false); 9081 case Intrinsic::r600_read_ngroups_z: 9082 if (Subtarget->isAmdHsaOS()) 9083 return emitNonHSAIntrinsicError(DAG, DL, VT); 9084 9085 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 9086 SI::KernelInputOffsets::NGROUPS_Z, Align(4), 9087 false); 9088 case Intrinsic::r600_read_local_size_x: 9089 if (Subtarget->isAmdHsaOS()) 9090 return emitNonHSAIntrinsicError(DAG, DL, VT); 9091 9092 return lowerImplicitZextParam(DAG, Op, MVT::i16, 9093 SI::KernelInputOffsets::LOCAL_SIZE_X); 9094 case Intrinsic::r600_read_local_size_y: 9095 if (Subtarget->isAmdHsaOS()) 9096 return emitNonHSAIntrinsicError(DAG, DL, VT); 9097 9098 return lowerImplicitZextParam(DAG, Op, MVT::i16, 9099 SI::KernelInputOffsets::LOCAL_SIZE_Y); 9100 case Intrinsic::r600_read_local_size_z: 9101 if (Subtarget->isAmdHsaOS()) 9102 return emitNonHSAIntrinsicError(DAG, DL, VT); 9103 9104 return lowerImplicitZextParam(DAG, Op, MVT::i16, 9105 SI::KernelInputOffsets::LOCAL_SIZE_Z); 9106 case Intrinsic::amdgcn_workgroup_id_x: 9107 return getPreloadedValue(DAG, *MFI, VT, 9108 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 9109 case Intrinsic::amdgcn_workgroup_id_y: 9110 return getPreloadedValue(DAG, *MFI, VT, 9111 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 9112 case Intrinsic::amdgcn_workgroup_id_z: 9113 return getPreloadedValue(DAG, *MFI, VT, 9114 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 9115 case Intrinsic::amdgcn_wave_id: 9116 return lowerWaveID(DAG, Op); 9117 
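  // In entry functions the LDS kernel id is a known constant (taken from the
  // "amdgpu-lds-kernel-id" attribute when present); other callees receive it
  // as a preloaded SGPR argument.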
case Intrinsic::amdgcn_lds_kernel_id: { 9118 if (MFI->isEntryFunction()) 9119 return getLDSKernelId(DAG, DL); 9120 return getPreloadedValue(DAG, *MFI, VT, 9121 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 9122 } 9123 case Intrinsic::amdgcn_workitem_id_x: 9124 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); 9125 case Intrinsic::amdgcn_workitem_id_y: 9126 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY); 9127 case Intrinsic::amdgcn_workitem_id_z: 9128 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ); 9129 case Intrinsic::amdgcn_wavefrontsize: 9130 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), 9131 SDLoc(Op), MVT::i32); 9132 case Intrinsic::amdgcn_s_buffer_load: { 9133 unsigned CPol = Op.getConstantOperandVal(3); 9134 // s_buffer_load, because of how it's optimized, can't be volatile 9135 // so reject ones with the volatile bit set. 9136 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) 9137 ? AMDGPU::CPol::ALL 9138 : AMDGPU::CPol::ALL_pregfx12)) 9139 return Op; 9140 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), 9141 Op.getOperand(3), DAG); 9142 } 9143 case Intrinsic::amdgcn_fdiv_fast: 9144 return lowerFDIV_FAST(Op, DAG); 9145 case Intrinsic::amdgcn_sin: 9146 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 9147 9148 case Intrinsic::amdgcn_cos: 9149 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 9150 9151 case Intrinsic::amdgcn_mul_u24: 9152 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), 9153 Op.getOperand(2)); 9154 case Intrinsic::amdgcn_mul_i24: 9155 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), 9156 Op.getOperand(2)); 9157 9158 case Intrinsic::amdgcn_log_clamp: { 9159 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 9160 return SDValue(); 9161 9162 return emitRemovedIntrinsicError(DAG, DL, VT); 9163 } 9164 case Intrinsic::amdgcn_fract: 9165 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 9166 9167 case Intrinsic::amdgcn_class: 9168 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), 9169 Op.getOperand(2)); 9170 case Intrinsic::amdgcn_div_fmas: 9171 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1), 9172 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); 9173 9174 case Intrinsic::amdgcn_div_fixup: 9175 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1), 9176 Op.getOperand(2), Op.getOperand(3)); 9177 9178 case Intrinsic::amdgcn_div_scale: { 9179 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3)); 9180 9181 // Translate to the operands expected by the machine instruction. The 9182 // first parameter must be the same as the first instruction. 9183 SDValue Numerator = Op.getOperand(1); 9184 SDValue Denominator = Op.getOperand(2); 9185 9186 // Note this order is opposite of the machine instruction's operations, 9187 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 9188 // intrinsic has the numerator as the first operand to match a normal 9189 // division operation. 9190 9191 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator; 9192 9193 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 9194 Denominator, Numerator); 9195 } 9196 case Intrinsic::amdgcn_icmp: { 9197 // There is a Pat that handles this variant, so return it as-is. 
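    // e.g. llvm.amdgcn.icmp(i1 %src, i1 false, ne) is selected directly by a
    // GCNPat (essentially a copy of the source lane mask), so leave that form
    // alone.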
9198 if (Op.getOperand(1).getValueType() == MVT::i1 && 9199 Op.getConstantOperandVal(2) == 0 && 9200 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE) 9201 return Op; 9202 return lowerICMPIntrinsic(*this, Op.getNode(), DAG); 9203 } 9204 case Intrinsic::amdgcn_fcmp: { 9205 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG); 9206 } 9207 case Intrinsic::amdgcn_ballot: 9208 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG); 9209 case Intrinsic::amdgcn_fmed3: 9210 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), 9211 Op.getOperand(2), Op.getOperand(3)); 9212 case Intrinsic::amdgcn_fdot2: 9213 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1), 9214 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); 9215 case Intrinsic::amdgcn_fmul_legacy: 9216 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), 9217 Op.getOperand(2)); 9218 case Intrinsic::amdgcn_sffbh: 9219 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); 9220 case Intrinsic::amdgcn_sbfe: 9221 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), 9222 Op.getOperand(2), Op.getOperand(3)); 9223 case Intrinsic::amdgcn_ubfe: 9224 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), 9225 Op.getOperand(2), Op.getOperand(3)); 9226 case Intrinsic::amdgcn_cvt_pkrtz: 9227 case Intrinsic::amdgcn_cvt_pknorm_i16: 9228 case Intrinsic::amdgcn_cvt_pknorm_u16: 9229 case Intrinsic::amdgcn_cvt_pk_i16: 9230 case Intrinsic::amdgcn_cvt_pk_u16: { 9231 // FIXME: Stop adding cast if v2f16/v2i16 are legal. 9232 EVT VT = Op.getValueType(); 9233 unsigned Opcode; 9234 9235 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) 9236 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; 9237 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) 9238 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 9239 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) 9240 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 9241 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) 9242 Opcode = AMDGPUISD::CVT_PK_I16_I32; 9243 else 9244 Opcode = AMDGPUISD::CVT_PK_U16_U32; 9245 9246 if (isTypeLegal(VT)) 9247 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2)); 9248 9249 SDValue Node = 9250 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); 9251 return DAG.getNode(ISD::BITCAST, DL, VT, Node); 9252 } 9253 case Intrinsic::amdgcn_fmad_ftz: 9254 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), 9255 Op.getOperand(2), Op.getOperand(3)); 9256 9257 case Intrinsic::amdgcn_if_break: 9258 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT, 9259 Op->getOperand(1), Op->getOperand(2)), 9260 0); 9261 9262 case Intrinsic::amdgcn_groupstaticsize: { 9263 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); 9264 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) 9265 return Op; 9266 9267 const Module *M = MF.getFunction().getParent(); 9268 const GlobalValue *GV = 9269 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize); 9270 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, 9271 SIInstrInfo::MO_ABS32_LO); 9272 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; 9273 } 9274 case Intrinsic::amdgcn_is_shared: 9275 case Intrinsic::amdgcn_is_private: { 9276 SDLoc SL(Op); 9277 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) 9278 ? 
AMDGPUAS::LOCAL_ADDRESS 9279 : AMDGPUAS::PRIVATE_ADDRESS; 9280 SDValue Aperture = getSegmentAperture(AS, SL, DAG); 9281 SDValue SrcVec = 9282 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 9283 9284 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, 9285 DAG.getConstant(1, SL, MVT::i32)); 9286 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); 9287 } 9288 case Intrinsic::amdgcn_perm: 9289 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), 9290 Op.getOperand(2), Op.getOperand(3)); 9291 case Intrinsic::amdgcn_reloc_constant: { 9292 Module *M = const_cast<Module *>(MF.getFunction().getParent()); 9293 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); 9294 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); 9295 auto *RelocSymbol = cast<GlobalVariable>( 9296 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); 9297 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0, 9298 SIInstrInfo::MO_ABS32_LO); 9299 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; 9300 } 9301 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: 9302 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: 9303 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: 9304 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: 9305 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: 9306 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: 9307 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: 9308 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { 9309 if (Op.getOperand(4).getValueType() == MVT::i32) 9310 return SDValue(); 9311 9312 SDLoc SL(Op); 9313 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); 9314 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), 9315 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 9316 Op.getOperand(3), IndexKeyi32); 9317 } 9318 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: 9319 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: 9320 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { 9321 if (Op.getOperand(6).getValueType() == MVT::i32) 9322 return SDValue(); 9323 9324 SDLoc SL(Op); 9325 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); 9326 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), 9327 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 9328 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), 9329 IndexKeyi32, Op.getOperand(7)}); 9330 } 9331 case Intrinsic::amdgcn_addrspacecast_nonnull: 9332 return lowerADDRSPACECAST(Op, DAG); 9333 case Intrinsic::amdgcn_readlane: 9334 case Intrinsic::amdgcn_readfirstlane: 9335 case Intrinsic::amdgcn_writelane: 9336 case Intrinsic::amdgcn_permlane16: 9337 case Intrinsic::amdgcn_permlanex16: 9338 case Intrinsic::amdgcn_permlane64: 9339 case Intrinsic::amdgcn_set_inactive: 9340 case Intrinsic::amdgcn_set_inactive_chain_arg: 9341 case Intrinsic::amdgcn_mov_dpp8: 9342 case Intrinsic::amdgcn_update_dpp: 9343 return lowerLaneOp(*this, Op.getNode(), DAG); 9344 case Intrinsic::amdgcn_dead: { 9345 SmallVector<SDValue, 8> Poisons; 9346 for (const EVT ValTy : Op.getNode()->values()) 9347 Poisons.push_back(DAG.getPOISON(ValTy)); 9348 return DAG.getMergeValues(Poisons, SDLoc(Op)); 9349 } 9350 default: 9351 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 9352 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) 9353 return lowerImage(Op, ImageDimIntr, DAG, false); 9354 9355 return Op; 9356 } 9357 } 9358 9359 // On targets not supporting 
constant in soffset field, turn zero to 9360 // SGPR_NULL to avoid generating an extra s_mov with zero. 9361 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, 9362 const GCNSubtarget *Subtarget) { 9363 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset)) 9364 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32); 9365 return SOffset; 9366 } 9367 9368 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, 9369 SelectionDAG &DAG, 9370 unsigned NewOpcode) const { 9371 SDLoc DL(Op); 9372 9373 SDValue VData = Op.getOperand(2); 9374 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9375 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9376 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9377 SDValue Ops[] = { 9378 Op.getOperand(0), // Chain 9379 VData, // vdata 9380 Rsrc, // rsrc 9381 DAG.getConstant(0, DL, MVT::i32), // vindex 9382 VOffset, // voffset 9383 SOffset, // soffset 9384 Offset, // offset 9385 Op.getOperand(6), // cachepolicy 9386 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9387 }; 9388 9389 auto *M = cast<MemSDNode>(Op); 9390 9391 EVT MemVT = VData.getValueType(); 9392 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, 9393 M->getMemOperand()); 9394 } 9395 9396 SDValue 9397 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, 9398 unsigned NewOpcode) const { 9399 SDLoc DL(Op); 9400 9401 SDValue VData = Op.getOperand(2); 9402 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9403 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 9404 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9405 SDValue Ops[] = { 9406 Op.getOperand(0), // Chain 9407 VData, // vdata 9408 Rsrc, // rsrc 9409 Op.getOperand(4), // vindex 9410 VOffset, // voffset 9411 SOffset, // soffset 9412 Offset, // offset 9413 Op.getOperand(7), // cachepolicy 9414 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9415 }; 9416 9417 auto *M = cast<MemSDNode>(Op); 9418 9419 EVT MemVT = VData.getValueType(); 9420 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, 9421 M->getMemOperand()); 9422 } 9423 9424 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 9425 SelectionDAG &DAG) const { 9426 unsigned IntrID = Op.getConstantOperandVal(1); 9427 SDLoc DL(Op); 9428 9429 switch (IntrID) { 9430 case Intrinsic::amdgcn_ds_ordered_add: 9431 case Intrinsic::amdgcn_ds_ordered_swap: { 9432 MemSDNode *M = cast<MemSDNode>(Op); 9433 SDValue Chain = M->getOperand(0); 9434 SDValue M0 = M->getOperand(2); 9435 SDValue Value = M->getOperand(3); 9436 unsigned IndexOperand = M->getConstantOperandVal(7); 9437 unsigned WaveRelease = M->getConstantOperandVal(8); 9438 unsigned WaveDone = M->getConstantOperandVal(9); 9439 9440 unsigned OrderedCountIndex = IndexOperand & 0x3f; 9441 IndexOperand &= ~0x3f; 9442 unsigned CountDw = 0; 9443 9444 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { 9445 CountDw = (IndexOperand >> 24) & 0xf; 9446 IndexOperand &= ~(0xf << 24); 9447 9448 if (CountDw < 1 || CountDw > 4) { 9449 const Function &Fn = DAG.getMachineFunction().getFunction(); 9450 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 9451 Fn, "ds_ordered_count: dword count must be between 1 and 4", 9452 DL.getDebugLoc())); 9453 CountDw = 1; 9454 } 9455 } 9456 9457 if (IndexOperand) { 9458 const Function &Fn = DAG.getMachineFunction().getFunction(); 9459 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 9460 Fn, "ds_ordered_count: bad index 
operand", DL.getDebugLoc())); 9461 } 9462 9463 if (WaveDone && !WaveRelease) { 9464 // TODO: Move this to IR verifier 9465 const Function &Fn = DAG.getMachineFunction().getFunction(); 9466 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 9467 Fn, "ds_ordered_count: wave_done requires wave_release", 9468 DL.getDebugLoc())); 9469 } 9470 9471 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1; 9472 unsigned ShaderType = 9473 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction()); 9474 unsigned Offset0 = OrderedCountIndex << 2; 9475 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); 9476 9477 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) 9478 Offset1 |= (CountDw - 1) << 6; 9479 9480 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) 9481 Offset1 |= ShaderType << 2; 9482 9483 unsigned Offset = Offset0 | (Offset1 << 8); 9484 9485 SDValue Ops[] = { 9486 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16), 9487 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue 9488 }; 9489 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, 9490 M->getVTList(), Ops, M->getMemoryVT(), 9491 M->getMemOperand()); 9492 } 9493 case Intrinsic::amdgcn_raw_buffer_load: 9494 case Intrinsic::amdgcn_raw_ptr_buffer_load: 9495 case Intrinsic::amdgcn_raw_atomic_buffer_load: 9496 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: 9497 case Intrinsic::amdgcn_raw_buffer_load_format: 9498 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: { 9499 const bool IsFormat = 9500 IntrID == Intrinsic::amdgcn_raw_buffer_load_format || 9501 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format; 9502 9503 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9504 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG); 9505 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); 9506 SDValue Ops[] = { 9507 Op.getOperand(0), // Chain 9508 Rsrc, // rsrc 9509 DAG.getConstant(0, DL, MVT::i32), // vindex 9510 VOffset, // voffset 9511 SOffset, // soffset 9512 Offset, // offset 9513 Op.getOperand(5), // cachepolicy, swizzled buffer 9514 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9515 }; 9516 9517 auto *M = cast<MemSDNode>(Op); 9518 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); 9519 } 9520 case Intrinsic::amdgcn_struct_buffer_load: 9521 case Intrinsic::amdgcn_struct_ptr_buffer_load: 9522 case Intrinsic::amdgcn_struct_buffer_load_format: 9523 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: 9524 case Intrinsic::amdgcn_struct_atomic_buffer_load: 9525 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: { 9526 const bool IsFormat = 9527 IntrID == Intrinsic::amdgcn_struct_buffer_load_format || 9528 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format; 9529 9530 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9531 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9532 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9533 SDValue Ops[] = { 9534 Op.getOperand(0), // Chain 9535 Rsrc, // rsrc 9536 Op.getOperand(3), // vindex 9537 VOffset, // voffset 9538 SOffset, // soffset 9539 Offset, // offset 9540 Op.getOperand(6), // cachepolicy, swizzled buffer 9541 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9542 }; 9543 9544 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); 9545 } 9546 case Intrinsic::amdgcn_raw_tbuffer_load: 9547 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { 9548 MemSDNode *M = cast<MemSDNode>(Op); 9549 EVT LoadVT = Op.getValueType(); 9550 SDValue 
Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9551 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG); 9552 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); 9553 9554 SDValue Ops[] = { 9555 Op.getOperand(0), // Chain 9556 Rsrc, // rsrc 9557 DAG.getConstant(0, DL, MVT::i32), // vindex 9558 VOffset, // voffset 9559 SOffset, // soffset 9560 Offset, // offset 9561 Op.getOperand(5), // format 9562 Op.getOperand(6), // cachepolicy, swizzled buffer 9563 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9564 }; 9565 9566 if (LoadVT.getScalarType() == MVT::f16) 9567 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, 9568 Ops); 9569 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, 9570 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), 9571 DAG); 9572 } 9573 case Intrinsic::amdgcn_struct_tbuffer_load: 9574 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { 9575 MemSDNode *M = cast<MemSDNode>(Op); 9576 EVT LoadVT = Op.getValueType(); 9577 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9578 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9579 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9580 9581 SDValue Ops[] = { 9582 Op.getOperand(0), // Chain 9583 Rsrc, // rsrc 9584 Op.getOperand(3), // vindex 9585 VOffset, // voffset 9586 SOffset, // soffset 9587 Offset, // offset 9588 Op.getOperand(6), // format 9589 Op.getOperand(7), // cachepolicy, swizzled buffer 9590 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9591 }; 9592 9593 if (LoadVT.getScalarType() == MVT::f16) 9594 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, 9595 Ops); 9596 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, 9597 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), 9598 DAG); 9599 } 9600 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 9601 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 9602 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); 9603 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 9604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 9605 return lowerStructBufferAtomicIntrin(Op, DAG, 9606 AMDGPUISD::BUFFER_ATOMIC_FADD); 9607 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 9608 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 9609 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); 9610 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 9611 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 9612 return lowerStructBufferAtomicIntrin(Op, DAG, 9613 AMDGPUISD::BUFFER_ATOMIC_FMIN); 9614 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 9615 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 9616 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); 9617 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 9618 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 9619 return lowerStructBufferAtomicIntrin(Op, DAG, 9620 AMDGPUISD::BUFFER_ATOMIC_FMAX); 9621 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 9622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 9623 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); 9624 case Intrinsic::amdgcn_raw_buffer_atomic_add: 9625 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 9626 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); 9627 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 9628 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 9629 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); 
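  // The remaining raw and struct buffer atomics follow the same pattern: each
  // intrinsic (and its *ptr* variant) maps 1:1 onto the matching
  // AMDGPUISD::BUFFER_ATOMIC_* node via the helpers above.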
9630 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 9631 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 9632 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN); 9633 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 9634 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 9635 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN); 9636 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 9637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 9638 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX); 9639 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 9640 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 9641 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX); 9642 case Intrinsic::amdgcn_raw_buffer_atomic_and: 9643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 9644 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); 9645 case Intrinsic::amdgcn_raw_buffer_atomic_or: 9646 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 9647 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); 9648 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 9649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 9650 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); 9651 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 9652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 9653 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); 9654 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 9655 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 9656 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); 9657 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: 9658 return lowerRawBufferAtomicIntrin(Op, DAG, 9659 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); 9660 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 9661 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 9662 return lowerStructBufferAtomicIntrin(Op, DAG, 9663 AMDGPUISD::BUFFER_ATOMIC_SWAP); 9664 case Intrinsic::amdgcn_struct_buffer_atomic_add: 9665 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 9666 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); 9667 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 9668 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 9669 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); 9670 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 9671 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 9672 return lowerStructBufferAtomicIntrin(Op, DAG, 9673 AMDGPUISD::BUFFER_ATOMIC_SMIN); 9674 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 9675 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 9676 return lowerStructBufferAtomicIntrin(Op, DAG, 9677 AMDGPUISD::BUFFER_ATOMIC_UMIN); 9678 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 9679 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 9680 return lowerStructBufferAtomicIntrin(Op, DAG, 9681 AMDGPUISD::BUFFER_ATOMIC_SMAX); 9682 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 9683 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 9684 return lowerStructBufferAtomicIntrin(Op, DAG, 9685 AMDGPUISD::BUFFER_ATOMIC_UMAX); 9686 case Intrinsic::amdgcn_struct_buffer_atomic_and: 9687 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 9688 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); 9689 case Intrinsic::amdgcn_struct_buffer_atomic_or: 9690 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 9691 return 
lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); 9692 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 9693 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 9694 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); 9695 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 9696 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 9697 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); 9698 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 9699 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 9700 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); 9701 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: 9702 return lowerStructBufferAtomicIntrin(Op, DAG, 9703 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); 9704 9705 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 9706 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { 9707 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); 9708 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 9709 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9710 SDValue Ops[] = { 9711 Op.getOperand(0), // Chain 9712 Op.getOperand(2), // src 9713 Op.getOperand(3), // cmp 9714 Rsrc, // rsrc 9715 DAG.getConstant(0, DL, MVT::i32), // vindex 9716 VOffset, // voffset 9717 SOffset, // soffset 9718 Offset, // offset 9719 Op.getOperand(7), // cachepolicy 9720 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9721 }; 9722 EVT VT = Op.getValueType(); 9723 auto *M = cast<MemSDNode>(Op); 9724 9725 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, 9726 Op->getVTList(), Ops, VT, 9727 M->getMemOperand()); 9728 } 9729 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 9730 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { 9731 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); 9732 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG); 9733 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget); 9734 SDValue Ops[] = { 9735 Op.getOperand(0), // Chain 9736 Op.getOperand(2), // src 9737 Op.getOperand(3), // cmp 9738 Rsrc, // rsrc 9739 Op.getOperand(5), // vindex 9740 VOffset, // voffset 9741 SOffset, // soffset 9742 Offset, // offset 9743 Op.getOperand(8), // cachepolicy 9744 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9745 }; 9746 EVT VT = Op.getValueType(); 9747 auto *M = cast<MemSDNode>(Op); 9748 9749 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, 9750 Op->getVTList(), Ops, VT, 9751 M->getMemOperand()); 9752 } 9753 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: 9754 case Intrinsic::amdgcn_image_bvh8_intersect_ray: { 9755 MemSDNode *M = cast<MemSDNode>(Op); 9756 SDValue NodePtr = M->getOperand(2); 9757 SDValue RayExtent = M->getOperand(3); 9758 SDValue InstanceMask = M->getOperand(4); 9759 SDValue RayOrigin = M->getOperand(5); 9760 SDValue RayDir = M->getOperand(6); 9761 SDValue Offsets = M->getOperand(7); 9762 SDValue TDescr = M->getOperand(8); 9763 9764 assert(NodePtr.getValueType() == MVT::i64); 9765 assert(RayDir.getValueType() == MVT::v3f32); 9766 9767 if (!Subtarget->hasBVHDualAndBVH8Insts()) { 9768 emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); 9769 return SDValue(); 9770 } 9771 9772 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray; 9773 const unsigned NumVDataDwords = 10; 9774 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12; 9775 int Opcode = AMDGPU::getMIMGOpcode( 9776 IsBVH8 ? 
AMDGPU::IMAGE_BVH8_INTERSECT_RAY 9777 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, 9778 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords); 9779 assert(Opcode != -1); 9780 9781 SmallVector<SDValue, 7> Ops; 9782 Ops.push_back(NodePtr); 9783 Ops.push_back(DAG.getBuildVector( 9784 MVT::v2i32, DL, 9785 {DAG.getBitcast(MVT::i32, RayExtent), 9786 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)})); 9787 Ops.push_back(RayOrigin); 9788 Ops.push_back(RayDir); 9789 Ops.push_back(Offsets); 9790 Ops.push_back(TDescr); 9791 Ops.push_back(M->getChain()); 9792 9793 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); 9794 MachineMemOperand *MemRef = M->getMemOperand(); 9795 DAG.setNodeMemRefs(NewNode, {MemRef}); 9796 return SDValue(NewNode, 0); 9797 } 9798 case Intrinsic::amdgcn_image_bvh_intersect_ray: { 9799 MemSDNode *M = cast<MemSDNode>(Op); 9800 SDValue NodePtr = M->getOperand(2); 9801 SDValue RayExtent = M->getOperand(3); 9802 SDValue RayOrigin = M->getOperand(4); 9803 SDValue RayDir = M->getOperand(5); 9804 SDValue RayInvDir = M->getOperand(6); 9805 SDValue TDescr = M->getOperand(7); 9806 9807 assert(NodePtr.getValueType() == MVT::i32 || 9808 NodePtr.getValueType() == MVT::i64); 9809 assert(RayDir.getValueType() == MVT::v3f16 || 9810 RayDir.getValueType() == MVT::v3f32); 9811 9812 if (!Subtarget->hasGFX10_AEncoding()) { 9813 emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); 9814 return SDValue(); 9815 } 9816 9817 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget); 9818 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); 9819 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 9820 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; 9821 const bool Is64 = NodePtr.getValueType() == MVT::i64; 9822 const unsigned NumVDataDwords = 4; 9823 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 9824 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 9825 const bool UseNSA = (Subtarget->hasNSAEncoding() && 9826 NumVAddrs <= Subtarget->getNSAMaxSize()) || 9827 IsGFX12Plus; 9828 const unsigned BaseOpcodes[2][2] = { 9829 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 9830 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 9831 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 9832 int Opcode; 9833 if (UseNSA) { 9834 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 9835 IsGFX12Plus ? AMDGPU::MIMGEncGfx12 9836 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA 9837 : AMDGPU::MIMGEncGfx10NSA, 9838 NumVDataDwords, NumVAddrDwords); 9839 } else { 9840 assert(!IsGFX12Plus); 9841 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 9842 IsGFX11 ? 
AMDGPU::MIMGEncGfx11Default 9843 : AMDGPU::MIMGEncGfx10Default, 9844 NumVDataDwords, NumVAddrDwords); 9845 } 9846 assert(Opcode != -1); 9847 9848 SmallVector<SDValue, 16> Ops; 9849 9850 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) { 9851 SmallVector<SDValue, 3> Lanes; 9852 DAG.ExtractVectorElements(Op, Lanes, 0, 3); 9853 if (Lanes[0].getValueSizeInBits() == 32) { 9854 for (unsigned I = 0; I < 3; ++I) 9855 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I])); 9856 } else { 9857 if (IsAligned) { 9858 Ops.push_back(DAG.getBitcast( 9859 MVT::i32, 9860 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]}))); 9861 Ops.push_back(Lanes[2]); 9862 } else { 9863 SDValue Elt0 = Ops.pop_back_val(); 9864 Ops.push_back(DAG.getBitcast( 9865 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]}))); 9866 Ops.push_back(DAG.getBitcast( 9867 MVT::i32, 9868 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]}))); 9869 } 9870 } 9871 }; 9872 9873 if (UseNSA && IsGFX11Plus) { 9874 Ops.push_back(NodePtr); 9875 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); 9876 Ops.push_back(RayOrigin); 9877 if (IsA16) { 9878 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes; 9879 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3); 9880 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3); 9881 for (unsigned I = 0; I < 3; ++I) { 9882 MergedLanes.push_back(DAG.getBitcast( 9883 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, 9884 {DirLanes[I], InvDirLanes[I]}))); 9885 } 9886 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes)); 9887 } else { 9888 Ops.push_back(RayDir); 9889 Ops.push_back(RayInvDir); 9890 } 9891 } else { 9892 if (Is64) 9893 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 9894 2); 9895 else 9896 Ops.push_back(NodePtr); 9897 9898 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); 9899 packLanes(RayOrigin, true); 9900 packLanes(RayDir, true); 9901 packLanes(RayInvDir, false); 9902 } 9903 9904 if (!UseNSA) { 9905 // Build a single vector containing all the operands so far prepared. 
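      // Non-NSA encodings take the whole address as one contiguous VGPR tuple,
      // so pad with poison up to the width the selected opcode expects.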
9906 if (NumVAddrDwords > 12) { 9907 SDValue Undef = DAG.getPOISON(MVT::i32); 9908 Ops.append(16 - Ops.size(), Undef); 9909 } 9910 assert(Ops.size() >= 8 && Ops.size() <= 12); 9911 SDValue MergedOps = 9912 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops); 9913 Ops.clear(); 9914 Ops.push_back(MergedOps); 9915 } 9916 9917 Ops.push_back(TDescr); 9918 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1)); 9919 Ops.push_back(M->getChain()); 9920 9921 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); 9922 MachineMemOperand *MemRef = M->getMemOperand(); 9923 DAG.setNodeMemRefs(NewNode, {MemRef}); 9924 return SDValue(NewNode, 0); 9925 } 9926 case Intrinsic::amdgcn_global_atomic_fmin_num: 9927 case Intrinsic::amdgcn_global_atomic_fmax_num: 9928 case Intrinsic::amdgcn_flat_atomic_fmin_num: 9929 case Intrinsic::amdgcn_flat_atomic_fmax_num: { 9930 MemSDNode *M = cast<MemSDNode>(Op); 9931 SDValue Ops[] = { 9932 M->getOperand(0), // Chain 9933 M->getOperand(2), // Ptr 9934 M->getOperand(3) // Value 9935 }; 9936 unsigned Opcode = 0; 9937 switch (IntrID) { 9938 case Intrinsic::amdgcn_global_atomic_fmin_num: 9939 case Intrinsic::amdgcn_flat_atomic_fmin_num: { 9940 Opcode = ISD::ATOMIC_LOAD_FMIN; 9941 break; 9942 } 9943 case Intrinsic::amdgcn_global_atomic_fmax_num: 9944 case Intrinsic::amdgcn_flat_atomic_fmax_num: { 9945 Opcode = ISD::ATOMIC_LOAD_FMAX; 9946 break; 9947 } 9948 default: 9949 llvm_unreachable("unhandled atomic opcode"); 9950 } 9951 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), 9952 Ops, M->getMemOperand()); 9953 } 9954 case Intrinsic::amdgcn_s_get_barrier_state: 9955 case Intrinsic::amdgcn_s_get_named_barrier_state: { 9956 SDValue Chain = Op->getOperand(0); 9957 SmallVector<SDValue, 2> Ops; 9958 unsigned Opc; 9959 9960 if (isa<ConstantSDNode>(Op->getOperand(2))) { 9961 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue(); 9962 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) 9963 BarID = (BarID >> 4) & 0x3F; 9964 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; 9965 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); 9966 Ops.push_back(K); 9967 Ops.push_back(Chain); 9968 } else { 9969 Opc = AMDGPU::S_GET_BARRIER_STATE_M0; 9970 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) { 9971 SDValue M0Val; 9972 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2), 9973 DAG.getShiftAmountConstant(4, MVT::i32, DL)); 9974 M0Val = SDValue( 9975 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val, 9976 DAG.getTargetConstant(0x3F, DL, MVT::i32)), 9977 0); 9978 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); 9979 } else 9980 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0)); 9981 } 9982 9983 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); 9984 return SDValue(NewMI, 0); 9985 } 9986 default: 9987 9988 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 9989 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 9990 return lowerImage(Op, ImageDimIntr, DAG, true); 9991 9992 return SDValue(); 9993 } 9994 } 9995 9996 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to 9997 // dwordx4 if on SI and handle TFE loads. 
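// With TFE (a three-VT list) the operation also produces an i32 status dword;
// the helper widens the load by one dword and then splits the value and the
// status back out of the result.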
9998 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9999                                               SDVTList VTList,
10000                                              ArrayRef<SDValue> Ops, EVT MemVT,
10001                                              MachineMemOperand *MMO,
10002                                              SelectionDAG &DAG) const {
10003   LLVMContext &C = *DAG.getContext();
10004   MachineFunction &MF = DAG.getMachineFunction();
10005   EVT VT = VTList.VTs[0];
10006
10007   assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10008   bool IsTFE = VTList.NumVTs == 3;
10009   if (IsTFE) {
10010     unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10011     unsigned NumOpDWords = NumValueDWords + 1;
10012     EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10013     SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10014     MachineMemOperand *OpDWordsMMO =
10015         MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10016     SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10017                                      OpDWordsVT, OpDWordsMMO, DAG);
10018     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10019                                  DAG.getVectorIdxConstant(NumValueDWords, DL));
10020     SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10021     SDValue ValueDWords =
10022         NumValueDWords == 1
10023             ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10024             : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10025                           EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10026                           ZeroIdx);
10027     SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10028     return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10029   }
10030
10031   if (!Subtarget->hasDwordx3LoadStores() &&
10032       (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10033     EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10034     EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10035     MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10036     SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10037     SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10038                                          WidenedMemVT, WidenedMMO);
10039     SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10040                                 DAG.getVectorIdxConstant(0, DL));
10041     return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10042   }
10043
10044   return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10045 }
10046
10047 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10048                                          bool ImageStore) const {
10049   EVT StoreVT = VData.getValueType();
10050
10051   // No change for f16 and legal vector D16 types.
10052   if (!StoreVT.isVector())
10053     return VData;
10054
10055   SDLoc DL(VData);
10056   unsigned NumElements = StoreVT.getVectorNumElements();
10057
10058   if (Subtarget->hasUnpackedD16VMem()) {
10059     // We need to unpack the packed data to store.
10060     EVT IntStoreVT = StoreVT.changeTypeToInteger();
10061     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10062
10063     EVT EquivStoreVT =
10064         EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10065     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10066     return DAG.UnrollVectorOp(ZExt.getNode());
10067   }
10068
10069   // The sq block of gfx8.1 does not estimate register use correctly for d16
10070   // image store instructions. The data operand is computed as if it were not a
10071   // d16 image instruction.
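  // Work around it by emitting the data as i32 lanes instead: pack pairs of
  // i16 elements into v2i16, bitcast each pair to i32, and pad with poison so
  // the operand covers the register count a non-d16 store would use.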
10072 if (ImageStore && Subtarget->hasImageStoreD16Bug()) { 10073 // Bitcast to i16 10074 EVT IntStoreVT = StoreVT.changeTypeToInteger(); 10075 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 10076 10077 // Decompose into scalars 10078 SmallVector<SDValue, 4> Elts; 10079 DAG.ExtractVectorElements(IntVData, Elts); 10080 10081 // Group pairs of i16 into v2i16 and bitcast to i32 10082 SmallVector<SDValue, 4> PackedElts; 10083 for (unsigned I = 0; I < Elts.size() / 2; I += 1) { 10084 SDValue Pair = 10085 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]}); 10086 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); 10087 PackedElts.push_back(IntPair); 10088 } 10089 if ((NumElements % 2) == 1) { 10090 // Handle v3i16 10091 unsigned I = Elts.size() / 2; 10092 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL, 10093 {Elts[I * 2], DAG.getPOISON(MVT::i16)}); 10094 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); 10095 PackedElts.push_back(IntPair); 10096 } 10097 10098 // Pad using UNDEF 10099 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32)); 10100 10101 // Build final vector 10102 EVT VecVT = 10103 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size()); 10104 return DAG.getBuildVector(VecVT, DL, PackedElts); 10105 } 10106 10107 if (NumElements == 3) { 10108 EVT IntStoreVT = 10109 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits()); 10110 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 10111 10112 EVT WidenedStoreVT = EVT::getVectorVT( 10113 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1); 10114 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(), 10115 WidenedStoreVT.getStoreSizeInBits()); 10116 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData); 10117 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); 10118 } 10119 10120 assert(isTypeLegal(StoreVT)); 10121 return VData; 10122 } 10123 10124 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 10125 SelectionDAG &DAG) const { 10126 SDLoc DL(Op); 10127 SDValue Chain = Op.getOperand(0); 10128 unsigned IntrinsicID = Op.getConstantOperandVal(1); 10129 MachineFunction &MF = DAG.getMachineFunction(); 10130 10131 switch (IntrinsicID) { 10132 case Intrinsic::amdgcn_exp_compr: { 10133 if (!Subtarget->hasCompressedExport()) { 10134 DAG.getContext()->diagnose(DiagnosticInfoUnsupported( 10135 DAG.getMachineFunction().getFunction(), 10136 "intrinsic not supported on subtarget", DL.getDebugLoc())); 10137 } 10138 SDValue Src0 = Op.getOperand(4); 10139 SDValue Src1 = Op.getOperand(5); 10140 // Hack around illegal type on SI by directly selecting it. 10141 if (isTypeLegal(Src0.getValueType())) 10142 return SDValue(); 10143 10144 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); 10145 SDValue Undef = DAG.getPOISON(MVT::f32); 10146 const SDValue Ops[] = { 10147 Op.getOperand(2), // tgt 10148 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0 10149 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1 10150 Undef, // src2 10151 Undef, // src3 10152 Op.getOperand(7), // vm 10153 DAG.getTargetConstant(1, DL, MVT::i1), // compr 10154 Op.getOperand(3), // en 10155 Op.getOperand(0) // Chain 10156 }; 10157 10158 unsigned Opc = Done->isZero() ? 
AMDGPU::EXP : AMDGPU::EXP_DONE; 10159 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); 10160 } 10161 case Intrinsic::amdgcn_s_barrier: 10162 case Intrinsic::amdgcn_s_barrier_signal: 10163 case Intrinsic::amdgcn_s_barrier_wait: { 10164 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 10165 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { 10166 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; 10167 if (WGSize <= ST.getWavefrontSize()) { 10168 // If the workgroup fits in a wave, remove s_barrier_signal and lower 10169 // s_barrier/s_barrier_wait to wave_barrier. 10170 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal) 10171 return Op.getOperand(0); 10172 else 10173 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, 10174 MVT::Other, Op.getOperand(0)), 10175 0); 10176 } 10177 } 10178 10179 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { 10180 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait 10181 SDValue K = 10182 DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); 10183 SDValue BarSignal = 10184 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, 10185 MVT::Other, K, Op.getOperand(0)), 10186 0); 10187 SDValue BarWait = 10188 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, 10189 BarSignal.getValue(0)), 10190 0); 10191 return BarWait; 10192 } 10193 10194 return SDValue(); 10195 }; 10196 10197 case Intrinsic::amdgcn_struct_tbuffer_store: 10198 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { 10199 SDValue VData = Op.getOperand(2); 10200 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 10201 if (IsD16) 10202 VData = handleD16VData(VData, DAG); 10203 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 10204 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 10205 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 10206 SDValue Ops[] = { 10207 Chain, 10208 VData, // vdata 10209 Rsrc, // rsrc 10210 Op.getOperand(4), // vindex 10211 VOffset, // voffset 10212 SOffset, // soffset 10213 Offset, // offset 10214 Op.getOperand(7), // format 10215 Op.getOperand(8), // cachepolicy, swizzled buffer 10216 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 10217 }; 10218 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 10219 : AMDGPUISD::TBUFFER_STORE_FORMAT; 10220 MemSDNode *M = cast<MemSDNode>(Op); 10221 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 10222 M->getMemoryVT(), M->getMemOperand()); 10223 } 10224 10225 case Intrinsic::amdgcn_raw_tbuffer_store: 10226 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { 10227 SDValue VData = Op.getOperand(2); 10228 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 10229 if (IsD16) 10230 VData = handleD16VData(VData, DAG); 10231 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 10232 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 10233 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 10234 SDValue Ops[] = { 10235 Chain, 10236 VData, // vdata 10237 Rsrc, // rsrc 10238 DAG.getConstant(0, DL, MVT::i32), // vindex 10239 VOffset, // voffset 10240 SOffset, // soffset 10241 Offset, // offset 10242 Op.getOperand(6), // format 10243 Op.getOperand(7), // cachepolicy, swizzled buffer 10244 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 10245 }; 10246 unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 10247 : AMDGPUISD::TBUFFER_STORE_FORMAT; 10248 MemSDNode *M = cast<MemSDNode>(Op); 10249 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 10250 M->getMemoryVT(), M->getMemOperand()); 10251 } 10252 10253 case Intrinsic::amdgcn_raw_buffer_store: 10254 case Intrinsic::amdgcn_raw_ptr_buffer_store: 10255 case Intrinsic::amdgcn_raw_buffer_store_format: 10256 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: { 10257 const bool IsFormat = 10258 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format || 10259 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format; 10260 10261 SDValue VData = Op.getOperand(2); 10262 EVT VDataVT = VData.getValueType(); 10263 EVT EltType = VDataVT.getScalarType(); 10264 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 10265 if (IsD16) { 10266 VData = handleD16VData(VData, DAG); 10267 VDataVT = VData.getValueType(); 10268 } 10269 10270 if (!isTypeLegal(VDataVT)) { 10271 VData = 10272 DAG.getNode(ISD::BITCAST, DL, 10273 getEquivalentMemType(*DAG.getContext(), VDataVT), VData); 10274 } 10275 10276 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 10277 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 10278 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 10279 SDValue Ops[] = { 10280 Chain, 10281 VData, 10282 Rsrc, 10283 DAG.getConstant(0, DL, MVT::i32), // vindex 10284 VOffset, // voffset 10285 SOffset, // soffset 10286 Offset, // offset 10287 Op.getOperand(6), // cachepolicy, swizzled buffer 10288 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 10289 }; 10290 unsigned Opc = 10291 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; 10292 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; 10293 MemSDNode *M = cast<MemSDNode>(Op); 10294 10295 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics 10296 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) 10297 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M); 10298 10299 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 10300 M->getMemoryVT(), M->getMemOperand()); 10301 } 10302 10303 case Intrinsic::amdgcn_struct_buffer_store: 10304 case Intrinsic::amdgcn_struct_ptr_buffer_store: 10305 case Intrinsic::amdgcn_struct_buffer_store_format: 10306 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: { 10307 const bool IsFormat = 10308 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format || 10309 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format; 10310 10311 SDValue VData = Op.getOperand(2); 10312 EVT VDataVT = VData.getValueType(); 10313 EVT EltType = VDataVT.getScalarType(); 10314 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 10315 10316 if (IsD16) { 10317 VData = handleD16VData(VData, DAG); 10318 VDataVT = VData.getValueType(); 10319 } 10320 10321 if (!isTypeLegal(VDataVT)) { 10322 VData = 10323 DAG.getNode(ISD::BITCAST, DL, 10324 getEquivalentMemType(*DAG.getContext(), VDataVT), VData); 10325 } 10326 10327 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 10328 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 10329 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 10330 SDValue Ops[] = { 10331 Chain, 10332 VData, 10333 Rsrc, 10334 Op.getOperand(4), // vindex 10335 VOffset, // voffset 10336 SOffset, // soffset 10337 Offset, // offset 10338 Op.getOperand(7), // cachepolicy, swizzled buffer 10339 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 10340 }; 10341 unsigned Opc = 10342 
!IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; 10343 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; 10344 MemSDNode *M = cast<MemSDNode>(Op); 10345 10346 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics 10347 EVT VDataType = VData.getValueType().getScalarType(); 10348 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) 10349 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); 10350 10351 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 10352 M->getMemoryVT(), M->getMemOperand()); 10353 } 10354 case Intrinsic::amdgcn_raw_buffer_load_lds: 10355 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: 10356 case Intrinsic::amdgcn_struct_buffer_load_lds: 10357 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { 10358 if (!Subtarget->hasVMemToLDSLoad()) 10359 return SDValue(); 10360 unsigned Opc; 10361 bool HasVIndex = 10362 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || 10363 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; 10364 unsigned OpOffset = HasVIndex ? 1 : 0; 10365 SDValue VOffset = Op.getOperand(5 + OpOffset); 10366 bool HasVOffset = !isNullConstant(VOffset); 10367 unsigned Size = Op->getConstantOperandVal(4); 10368 10369 switch (Size) { 10370 default: 10371 return SDValue(); 10372 case 1: 10373 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN 10374 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN 10375 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN 10376 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; 10377 break; 10378 case 2: 10379 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN 10380 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN 10381 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN 10382 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; 10383 break; 10384 case 4: 10385 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN 10386 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN 10387 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN 10388 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; 10389 break; 10390 case 12: 10391 if (!Subtarget->hasLDSLoadB96_B128()) 10392 return SDValue(); 10393 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN 10394 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN 10395 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN 10396 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET; 10397 break; 10398 case 16: 10399 if (!Subtarget->hasLDSLoadB96_B128()) 10400 return SDValue(); 10401 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN 10402 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN 10403 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN 10404 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET; 10405 break; 10406 } 10407 10408 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 10409 10410 SmallVector<SDValue, 8> Ops; 10411 10412 if (HasVIndex && HasVOffset) 10413 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL, 10414 {Op.getOperand(5), // VIndex 10415 VOffset})); 10416 else if (HasVIndex) 10417 Ops.push_back(Op.getOperand(5)); 10418 else if (HasVOffset) 10419 Ops.push_back(VOffset); 10420 10421 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 10422 Ops.push_back(Rsrc); 10423 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset 10424 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset 10425 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 10426 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); 10427 Ops.push_back(DAG.getTargetConstant( 10428 Aux & (IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12), 10429 DL, MVT::i8)); // cpol 10430 Ops.push_back(DAG.getTargetConstant( 10431 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) 10432 ? 1 10433 : 0, 10434 DL, MVT::i8)); // swz 10435 Ops.push_back(M0Val.getValue(0)); // Chain 10436 Ops.push_back(M0Val.getValue(1)); // Glue 10437 10438 auto *M = cast<MemSDNode>(Op); 10439 MachineMemOperand *LoadMMO = M->getMemOperand(); 10440 // Don't set the offset value here because the pointer points to the base of 10441 // the buffer. 10442 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 10443 10444 MachinePointerInfo StorePtrI = LoadPtrI; 10445 LoadPtrI.V = PoisonValue::get( 10446 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); 10447 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 10448 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 10449 10450 auto F = LoadMMO->getFlags() & 10451 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 10452 LoadMMO = 10453 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, 10454 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 10455 10456 MachineMemOperand *StoreMMO = MF.getMachineMemOperand( 10457 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), 10458 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 10459 10460 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); 10461 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); 10462 10463 return SDValue(Load, 0); 10464 } 10465 // Buffers are handled by LowerBufferFatPointers, and we're going to go 10466 // for "trust me" that the remaining cases are global pointers until 10467 // such time as we can put two mem operands on an intrinsic. 10468 case Intrinsic::amdgcn_load_to_lds: 10469 case Intrinsic::amdgcn_global_load_lds: { 10470 if (!Subtarget->hasVMemToLDSLoad()) 10471 return SDValue(); 10472 10473 unsigned Opc; 10474 unsigned Size = Op->getConstantOperandVal(4); 10475 switch (Size) { 10476 default: 10477 return SDValue(); 10478 case 1: 10479 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; 10480 break; 10481 case 2: 10482 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; 10483 break; 10484 case 4: 10485 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; 10486 break; 10487 case 12: 10488 if (!Subtarget->hasLDSLoadB96_B128()) 10489 return SDValue(); 10490 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3; 10491 break; 10492 case 16: 10493 if (!Subtarget->hasLDSLoadB96_B128()) 10494 return SDValue(); 10495 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4; 10496 break; 10497 } 10498 10499 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 10500 10501 SmallVector<SDValue, 6> Ops; 10502 10503 SDValue Addr = Op.getOperand(2); // Global ptr 10504 SDValue VOffset; 10505 // Try to split SAddr and VOffset. Global and LDS pointers share the same 10506 // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 
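    // e.g. for addr = (i64 SGPR base) + zext(i32 VGPR offset), the SGPR pair
    // can feed saddr and the 32-bit VGPR becomes the voffset; otherwise the
    // full 64-bit address stays in VGPRs.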
10507 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { 10508 SDValue LHS = Addr.getOperand(0); 10509 SDValue RHS = Addr.getOperand(1); 10510 10511 if (LHS->isDivergent()) 10512 std::swap(LHS, RHS); 10513 10514 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && 10515 RHS.getOperand(0).getValueType() == MVT::i32) { 10516 // add (i64 sgpr), (zero_extend (i32 vgpr)) 10517 Addr = LHS; 10518 VOffset = RHS.getOperand(0); 10519 } 10520 } 10521 10522 Ops.push_back(Addr); 10523 if (!Addr->isDivergent()) { 10524 Opc = AMDGPU::getGlobalSaddrOp(Opc); 10525 if (!VOffset) 10526 VOffset = 10527 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, 10528 DAG.getTargetConstant(0, DL, MVT::i32)), 10529 0); 10530 Ops.push_back(VOffset); 10531 } 10532 10533 Ops.push_back(Op.getOperand(5)); // Offset 10534 Ops.push_back(Op.getOperand(6)); // CPol 10535 Ops.push_back(M0Val.getValue(0)); // Chain 10536 Ops.push_back(M0Val.getValue(1)); // Glue 10537 10538 auto *M = cast<MemSDNode>(Op); 10539 MachineMemOperand *LoadMMO = M->getMemOperand(); 10540 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 10541 LoadPtrI.Offset = Op->getConstantOperandVal(5); 10542 MachinePointerInfo StorePtrI = LoadPtrI; 10543 LoadPtrI.V = PoisonValue::get( 10544 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); 10545 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 10546 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 10547 auto F = LoadMMO->getFlags() & 10548 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 10549 LoadMMO = 10550 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, 10551 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 10552 MachineMemOperand *StoreMMO = MF.getMachineMemOperand( 10553 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), 10554 LoadMMO->getAAInfo()); 10555 10556 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); 10557 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); 10558 10559 return SDValue(Load, 0); 10560 } 10561 case Intrinsic::amdgcn_end_cf: 10562 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, 10563 Op->getOperand(2), Chain), 10564 0); 10565 case Intrinsic::amdgcn_s_barrier_signal_var: { 10566 // these two intrinsics have two operands: barrier pointer and member count 10567 SDValue Chain = Op->getOperand(0); 10568 SmallVector<SDValue, 2> Ops; 10569 SDValue BarOp = Op->getOperand(2); 10570 SDValue CntOp = Op->getOperand(3); 10571 SDValue M0Val; 10572 // extract the BarrierID from bits 4-9 of BarOp 10573 SDValue BarID; 10574 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, 10575 DAG.getShiftAmountConstant(4, MVT::i32, DL)); 10576 BarID = 10577 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID, 10578 DAG.getTargetConstant(0x3F, DL, MVT::i32)), 10579 0); 10580 // Member count should be put into M0[ShAmt:+6] 10581 // Barrier ID should be put into M0[5:0] 10582 M0Val = 10583 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp, 10584 DAG.getTargetConstant(0x3F, DL, MVT::i32)), 10585 0); 10586 constexpr unsigned ShAmt = 16; 10587 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp, 10588 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL)); 10589 10590 M0Val = SDValue( 10591 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0); 10592 10593 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); 10594 10595 auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL, 10596 Op->getVTList(), Ops); 10597 return SDValue(NewMI, 0); 10598 } 
10599 case Intrinsic::amdgcn_s_prefetch_data: { 10600 // For non-global address space preserve the chain and remove the call. 10601 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace())) 10602 return Op.getOperand(0); 10603 return Op; 10604 } 10605 case Intrinsic::amdgcn_s_buffer_prefetch_data: { 10606 SDValue Ops[] = { 10607 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG), 10608 Op.getOperand(3), // offset 10609 Op.getOperand(4), // length 10610 }; 10611 10612 MemSDNode *M = cast<MemSDNode>(Op); 10613 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL, 10614 Op->getVTList(), Ops, M->getMemoryVT(), 10615 M->getMemOperand()); 10616 } 10617 default: { 10618 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 10619 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) 10620 return lowerImage(Op, ImageDimIntr, DAG, true); 10621 10622 return Op; 10623 } 10624 } 10625 } 10626 10627 bool SITargetLowering::shouldPreservePtrArith(const Function &F, 10628 EVT PtrVT) const { 10629 return UseSelectionDAGPTRADD && PtrVT == MVT::i64; 10630 } 10631 10632 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 10633 // offset (the offset that is included in bounds checking and swizzling, to be 10634 // split between the instruction's voffset and immoffset fields) and soffset 10635 // (the offset that is excluded from bounds checking and swizzling, to go in 10636 // the instruction's soffset field). This function takes the first kind of 10637 // offset and figures out how to split it between voffset and immoffset. 10638 std::pair<SDValue, SDValue> 10639 SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const { 10640 SDLoc DL(Offset); 10641 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); 10642 SDValue N0 = Offset; 10643 ConstantSDNode *C1 = nullptr; 10644 10645 if ((C1 = dyn_cast<ConstantSDNode>(N0))) 10646 N0 = SDValue(); 10647 else if (DAG.isBaseWithConstantOffset(N0)) { 10648 C1 = cast<ConstantSDNode>(N0.getOperand(1)); 10649 N0 = N0.getOperand(0); 10650 } 10651 10652 if (C1) { 10653 unsigned ImmOffset = C1->getZExtValue(); 10654 // If the immediate value is too big for the immoffset field, put only bits 10655 // that would normally fit in the immoffset field. The remaining value that 10656 // is copied/added for the voffset field is a large power of 2, and it 10657 // stands more chance of being CSEd with the copy/add for another similar 10658 // load/store. 10659 // However, do not do that rounding down if that is a negative 10660 // number, as it appears to be illegal to have a negative offset in the 10661 // vgpr, even if adding the immediate offset makes it positive. 
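    // For example, assuming MaxImm were 4095, an immediate of 8195 would be
    // split into Overflow = 8192 (moved to the voffset) and ImmOffset = 3
    // (kept in the immoffset field).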
10662 unsigned Overflow = ImmOffset & ~MaxImm; 10663 ImmOffset -= Overflow; 10664 if ((int32_t)Overflow < 0) { 10665 Overflow += ImmOffset; 10666 ImmOffset = 0; 10667 } 10668 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32)); 10669 if (Overflow) { 10670 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32); 10671 if (!N0) 10672 N0 = OverflowVal; 10673 else { 10674 SDValue Ops[] = {N0, OverflowVal}; 10675 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops); 10676 } 10677 } 10678 } 10679 if (!N0) 10680 N0 = DAG.getConstant(0, DL, MVT::i32); 10681 if (!C1) 10682 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32)); 10683 return {N0, SDValue(C1, 0)}; 10684 } 10685 10686 // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store 10687 // the three offsets (voffset, soffset and instoffset) into the SDValue[3] array 10688 // pointed to by Offsets. 10689 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, 10690 SelectionDAG &DAG, SDValue *Offsets, 10691 Align Alignment) const { 10692 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 10693 SDLoc DL(CombinedOffset); 10694 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) { 10695 uint32_t Imm = C->getZExtValue(); 10696 uint32_t SOffset, ImmOffset; 10697 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) { 10698 Offsets[0] = DAG.getConstant(0, DL, MVT::i32); 10699 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); 10700 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); 10701 return; 10702 } 10703 } 10704 if (DAG.isBaseWithConstantOffset(CombinedOffset)) { 10705 SDValue N0 = CombinedOffset.getOperand(0); 10706 SDValue N1 = CombinedOffset.getOperand(1); 10707 uint32_t SOffset, ImmOffset; 10708 int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); 10709 if (Offset >= 0 && 10710 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { 10711 Offsets[0] = N0; 10712 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); 10713 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); 10714 return; 10715 } 10716 } 10717 10718 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset() 10719 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32) 10720 : DAG.getConstant(0, DL, MVT::i32); 10721 10722 Offsets[0] = CombinedOffset; 10723 Offsets[1] = SOffsetZero; 10724 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); 10725 } 10726 10727 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer, 10728 SelectionDAG &DAG) const { 10729 if (!MaybePointer.getValueType().isScalarInteger()) 10730 return MaybePointer; 10731 10732 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer); 10733 return Rsrc; 10734 } 10735 10736 // Wrap a global or flat pointer into a buffer intrinsic using the flags 10737 // specified in the intrinsic. 
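// The resulting v4i32 descriptor is assembled as: word 0 = pointer[31:0],
// word 1 = pointer[47:32] with the stride (if any) shifted into bits [31:16],
// word 2 = NumRecords, and word 3 = the flags operand.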
10738 SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, 10739 SelectionDAG &DAG) const { 10740 SDLoc Loc(Op); 10741 10742 SDValue Pointer = Op->getOperand(1); 10743 SDValue Stride = Op->getOperand(2); 10744 SDValue NumRecords = Op->getOperand(3); 10745 SDValue Flags = Op->getOperand(4); 10746 10747 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32); 10748 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32); 10749 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask); 10750 std::optional<uint32_t> ConstStride = std::nullopt; 10751 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride)) 10752 ConstStride = ConstNode->getZExtValue(); 10753 10754 SDValue NewHighHalf = Masked; 10755 if (!ConstStride || *ConstStride != 0) { 10756 SDValue ShiftedStride; 10757 if (ConstStride) { 10758 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32); 10759 } else { 10760 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32); 10761 ShiftedStride = 10762 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, 10763 DAG.getShiftAmountConstant(16, MVT::i32, Loc)); 10764 } 10765 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride); 10766 } 10767 10768 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, 10769 NewHighHalf, NumRecords, Flags); 10770 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc); 10771 return RsrcPtr; 10772 } 10773 10774 // Handle 8 bit and 16 bit buffer loads 10775 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, 10776 EVT LoadVT, SDLoc DL, 10777 ArrayRef<SDValue> Ops, 10778 MachineMemOperand *MMO, 10779 bool IsTFE) const { 10780 EVT IntVT = LoadVT.changeTypeToInteger(); 10781 10782 if (IsTFE) { 10783 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) 10784 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE 10785 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE; 10786 MachineFunction &MF = DAG.getMachineFunction(); 10787 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8); 10788 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other); 10789 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG); 10790 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, 10791 DAG.getConstant(1, DL, MVT::i32)); 10792 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, 10793 DAG.getConstant(0, DL, MVT::i32)); 10794 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data); 10795 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc); 10796 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL); 10797 } 10798 10799 unsigned Opc = LoadVT.getScalarType() == MVT::i8 10800 ? 
AMDGPUISD::BUFFER_LOAD_UBYTE 10801 : AMDGPUISD::BUFFER_LOAD_USHORT; 10802 10803 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other); 10804 SDValue BufferLoad = 10805 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO); 10806 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad); 10807 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal); 10808 10809 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL); 10810 } 10811 10812 // Handle 8 bit and 16 bit buffer stores 10813 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, 10814 EVT VDataType, SDLoc DL, 10815 SDValue Ops[], 10816 MemSDNode *M) const { 10817 if (VDataType == MVT::f16 || VDataType == MVT::bf16) 10818 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]); 10819 10820 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); 10821 Ops[1] = BufferStoreExt; 10822 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE 10823 : AMDGPUISD::BUFFER_STORE_SHORT; 10824 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9); 10825 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType, 10826 M->getMemOperand()); 10827 } 10828 10829 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, 10830 SDValue Op, const SDLoc &SL, EVT VT) { 10831 if (VT.bitsLT(Op.getValueType())) 10832 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op); 10833 10834 switch (ExtType) { 10835 case ISD::SEXTLOAD: 10836 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op); 10837 case ISD::ZEXTLOAD: 10838 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op); 10839 case ISD::EXTLOAD: 10840 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op); 10841 case ISD::NON_EXTLOAD: 10842 return Op; 10843 } 10844 10845 llvm_unreachable("invalid ext type"); 10846 } 10847 10848 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads. 10849 // TODO: Skip this on GFX12 which does have scalar sub-dword loads. 10850 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, 10851 DAGCombinerInfo &DCI) const { 10852 SelectionDAG &DAG = DCI.DAG; 10853 if (Ld->getAlign() < Align(4) || Ld->isDivergent()) 10854 return SDValue(); 10855 10856 // FIXME: Constant loads should all be marked invariant. 10857 unsigned AS = Ld->getAddressSpace(); 10858 if (AS != AMDGPUAS::CONSTANT_ADDRESS && 10859 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT && 10860 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) 10861 return SDValue(); 10862 10863 // Don't do this early, since it may interfere with adjacent load merging for 10864 // illegal types. We can avoid losing alignment information for exotic types 10865 // pre-legalize. 10866 EVT MemVT = Ld->getMemoryVT(); 10867 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) || 10868 MemVT.getSizeInBits() >= 32) 10869 return SDValue(); 10870 10871 SDLoc SL(Ld); 10872 10873 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && 10874 "unexpected vector extload"); 10875 10876 // TODO: Drop only high part of range. 
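  // Re-issue the access as a full 32-bit load; the original range metadata no
  // longer describes the widened value, so it is dropped below.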
10877 SDValue Ptr = Ld->getBasePtr(); 10878 SDValue NewLoad = DAG.getLoad( 10879 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr, 10880 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(), 10881 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(), 10882 nullptr); // Drop ranges 10883 10884 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); 10885 if (MemVT.isFloatingPoint()) { 10886 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && 10887 "unexpected fp extload"); 10888 TruncVT = MemVT.changeTypeToInteger(); 10889 } 10890 10891 SDValue Cvt = NewLoad; 10892 if (Ld->getExtensionType() == ISD::SEXTLOAD) { 10893 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad, 10894 DAG.getValueType(TruncVT)); 10895 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || 10896 Ld->getExtensionType() == ISD::NON_EXTLOAD) { 10897 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT); 10898 } else { 10899 assert(Ld->getExtensionType() == ISD::EXTLOAD); 10900 } 10901 10902 EVT VT = Ld->getValueType(0); 10903 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); 10904 10905 DCI.AddToWorklist(Cvt.getNode()); 10906 10907 // We may need to handle exotic cases, such as i16->i64 extloads, so insert 10908 // the appropriate extension from the 32-bit load. 10909 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT); 10910 DCI.AddToWorklist(Cvt.getNode()); 10911 10912 // Handle conversion back to floating point if necessary. 10913 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt); 10914 10915 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL); 10916 } 10917 10918 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, 10919 const SIMachineFunctionInfo &Info) { 10920 // TODO: Should check if the address can definitely not access stack. 10921 if (Info.isEntryFunction()) 10922 return Info.getUserSGPRInfo().hasFlatScratchInit(); 10923 return true; 10924 } 10925 10926 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 10927 SDLoc DL(Op); 10928 LoadSDNode *Load = cast<LoadSDNode>(Op); 10929 ISD::LoadExtType ExtType = Load->getExtensionType(); 10930 EVT MemVT = Load->getMemoryVT(); 10931 MachineMemOperand *MMO = Load->getMemOperand(); 10932 10933 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { 10934 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16)) 10935 return SDValue(); 10936 10937 // FIXME: Copied from PPC 10938 // First, load into 32 bits, then truncate to 1 bit. 10939 10940 SDValue Chain = Load->getChain(); 10941 SDValue BasePtr = Load->getBasePtr(); 10942 10943 EVT RealMemVT = (MemVT == MVT::i1) ? 
MVT::i8 : MVT::i16; 10944 10945 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, 10946 RealMemVT, MMO); 10947 10948 if (!MemVT.isVector()) { 10949 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), 10950 NewLD.getValue(1)}; 10951 10952 return DAG.getMergeValues(Ops, DL); 10953 } 10954 10955 SmallVector<SDValue, 3> Elts; 10956 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { 10957 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD, 10958 DAG.getConstant(I, DL, MVT::i32)); 10959 10960 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt)); 10961 } 10962 10963 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)}; 10964 10965 return DAG.getMergeValues(Ops, DL); 10966 } 10967 10968 if (!MemVT.isVector()) 10969 return SDValue(); 10970 10971 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 10972 "Custom lowering for non-i32 vectors hasn't been implemented."); 10973 10974 Align Alignment = Load->getAlign(); 10975 unsigned AS = Load->getAddressSpace(); 10976 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && 10977 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { 10978 return SplitVectorLoad(Op, DAG); 10979 } 10980 10981 MachineFunction &MF = DAG.getMachineFunction(); 10982 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 10983 // If there is a possibility that flat instruction access scratch memory 10984 // then we need to use the same legalization rules we use for private. 10985 if (AS == AMDGPUAS::FLAT_ADDRESS && 10986 !Subtarget->hasMultiDwordFlatScratchAddressing()) 10987 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) 10988 ? AMDGPUAS::PRIVATE_ADDRESS 10989 : AMDGPUAS::GLOBAL_ADDRESS; 10990 10991 unsigned NumElements = MemVT.getVectorNumElements(); 10992 10993 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 10994 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 10995 (AS == AMDGPUAS::GLOBAL_ADDRESS && 10996 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() && 10997 isMemOpHasNoClobberedMemOperand(Load))) { 10998 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) && 10999 Alignment >= Align(4) && NumElements < 32) { 11000 if (MemVT.isPow2VectorType() || 11001 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) 11002 return SDValue(); 11003 return WidenOrSplitVectorLoad(Op, DAG); 11004 } 11005 // Non-uniform loads will be selected to MUBUF instructions, so they 11006 // have the same legalization requirements as global and private 11007 // loads. 11008 // 11009 } 11010 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 11011 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 11012 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { 11013 if (NumElements > 4) 11014 return SplitVectorLoad(Op, DAG); 11015 // v3 loads not supported on SI. 11016 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 11017 return WidenOrSplitVectorLoad(Op, DAG); 11018 11019 // v3 and v4 loads are supported for private and global memory. 11020 return SDValue(); 11021 } 11022 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 11023 // Depending on the setting of the private_element_size field in the 11024 // resource descriptor, we can only make private accesses up to a certain 11025 // size. 
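  // With a 4-byte limit every element becomes its own load; 8- and 16-byte
  // limits allow 2- and 4-dword accesses before the vector has to be split.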
11026 switch (Subtarget->getMaxPrivateElementSize()) { 11027 case 4: { 11028 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG); 11029 return DAG.getMergeValues({Op0, Op1}, DL); 11030 } 11031 case 8: 11032 if (NumElements > 2) 11033 return SplitVectorLoad(Op, DAG); 11034 return SDValue(); 11035 case 16: 11036 // Same as global/flat 11037 if (NumElements > 4) 11038 return SplitVectorLoad(Op, DAG); 11039 // v3 loads not supported on SI. 11040 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 11041 return WidenOrSplitVectorLoad(Op, DAG); 11042 11043 return SDValue(); 11044 default: 11045 llvm_unreachable("unsupported private_element_size"); 11046 } 11047 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 11048 unsigned Fast = 0; 11049 auto Flags = Load->getMemOperand()->getFlags(); 11050 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, 11051 Load->getAlign(), Flags, &Fast) && 11052 Fast > 1) 11053 return SDValue(); 11054 11055 if (MemVT.isVector()) 11056 return SplitVectorLoad(Op, DAG); 11057 } 11058 11059 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 11060 MemVT, *Load->getMemOperand())) { 11061 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG); 11062 return DAG.getMergeValues({Op0, Op1}, DL); 11063 } 11064 11065 return SDValue(); 11066 } 11067 11068 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 11069 EVT VT = Op.getValueType(); 11070 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || 11071 VT.getSizeInBits() == 512) 11072 return splitTernaryVectorOp(Op, DAG); 11073 11074 assert(VT.getSizeInBits() == 64); 11075 11076 SDLoc DL(Op); 11077 SDValue Cond = Op.getOperand(0); 11078 11079 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 11080 SDValue One = DAG.getConstant(1, DL, MVT::i32); 11081 11082 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 11083 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 11084 11085 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 11086 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 11087 11088 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 11089 11090 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 11091 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 11092 11093 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 11094 11095 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 11096 return DAG.getNode(ISD::BITCAST, DL, VT, Res); 11097 } 11098 11099 // Catch division cases where we can use shortcuts with rcp and rsq 11100 // instructions. 11101 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, 11102 SelectionDAG &DAG) const { 11103 SDLoc SL(Op); 11104 SDValue LHS = Op.getOperand(0); 11105 SDValue RHS = Op.getOperand(1); 11106 EVT VT = Op.getValueType(); 11107 const SDNodeFlags Flags = Op->getFlags(); 11108 11109 bool AllowInaccurateRcp = 11110 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; 11111 11112 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 11113 // Without !fpmath accuracy information, we can't do more because we don't 11114 // know exactly whether rcp is accurate enough to meet !fpmath requirement. 
11115 // f16 is always accurate enough 11116 if (!AllowInaccurateRcp && VT != MVT::f16) 11117 return SDValue(); 11118 11119 if (CLHS->isExactlyValue(1.0)) { 11120 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 11121 // the CI documentation has a worst case error of 1 ulp. 11122 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 11123 // use it as long as we aren't trying to use denormals. 11124 // 11125 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 11126 11127 // 1.0 / sqrt(x) -> rsq(x) 11128 11129 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 11130 // error seems really high at 2^29 ULP. 11131 // 1.0 / x -> rcp(x) 11132 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 11133 } 11134 11135 // Same as for 1.0, but expand the sign out of the constant. 11136 if (CLHS->isExactlyValue(-1.0)) { 11137 // -1.0 / x -> rcp (fneg x) 11138 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 11139 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); 11140 } 11141 } 11142 11143 // For f16 require afn or arcp. 11144 // For f32 require afn. 11145 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) 11146 return SDValue(); 11147 11148 // Turn into multiply by the reciprocal. 11149 // x / y -> x * (1.0 / y) 11150 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 11151 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); 11152 } 11153 11154 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, 11155 SelectionDAG &DAG) const { 11156 SDLoc SL(Op); 11157 SDValue X = Op.getOperand(0); 11158 SDValue Y = Op.getOperand(1); 11159 EVT VT = Op.getValueType(); 11160 const SDNodeFlags Flags = Op->getFlags(); 11161 11162 bool AllowInaccurateDiv = 11163 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; 11164 if (!AllowInaccurateDiv) 11165 return SDValue(); 11166 11167 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y); 11168 SDValue One = DAG.getConstantFP(1.0, SL, VT); 11169 11170 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y); 11171 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); 11172 11173 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R); 11174 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); 11175 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R); 11176 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R); 11177 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X); 11178 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret); 11179 } 11180 11181 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 11182 EVT VT, SDValue A, SDValue B, SDValue GlueChain, 11183 SDNodeFlags Flags) { 11184 if (GlueChain->getNumValues() <= 1) { 11185 return DAG.getNode(Opcode, SL, VT, A, B, Flags); 11186 } 11187 11188 assert(GlueChain->getNumValues() == 3); 11189 11190 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 11191 switch (Opcode) { 11192 default: 11193 llvm_unreachable("no chain equivalent for opcode"); 11194 case ISD::FMUL: 11195 Opcode = AMDGPUISD::FMUL_W_CHAIN; 11196 break; 11197 } 11198 11199 return DAG.getNode(Opcode, SL, VTList, 11200 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)}, 11201 Flags); 11202 } 11203 11204 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 11205 EVT VT, SDValue A, SDValue B, SDValue C, 11206 SDValue GlueChain, SDNodeFlags Flags) { 11207 if (GlueChain->getNumValues() <= 1) { 11208 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags); 11209 } 11210 11211 
assert(GlueChain->getNumValues() == 3); 11212 11213 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 11214 switch (Opcode) { 11215 default: 11216 llvm_unreachable("no chain equivalent for opcode"); 11217 case ISD::FMA: 11218 Opcode = AMDGPUISD::FMA_W_CHAIN; 11219 break; 11220 } 11221 11222 return DAG.getNode(Opcode, SL, VTList, 11223 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)}, 11224 Flags); 11225 } 11226 11227 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { 11228 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 11229 return FastLowered; 11230 11231 SDLoc SL(Op); 11232 SDValue LHS = Op.getOperand(0); 11233 SDValue RHS = Op.getOperand(1); 11234 11235 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 11236 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 11237 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d 11238 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp 11239 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 11240 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp 11241 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 11242 // tmp.u = opx(V_MUL_F32, e32.u, r32.u); 11243 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000) 11244 // q32.u = opx(V_ADD_F32, tmp.u, q32.u); 11245 // q16.u = opx(V_CVT_F16_F32, q32.u); 11246 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) 11247 11248 // We will use ISD::FMA on targets that don't support ISD::FMAD. 11249 unsigned FMADOpCode = 11250 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA; 11251 11252 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); 11253 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); 11254 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt); 11255 SDValue Rcp = 11256 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags()); 11257 SDValue Quot = 11258 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags()); 11259 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt, 11260 Op->getFlags()); 11261 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags()); 11262 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt, 11263 Op->getFlags()); 11264 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags()); 11265 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp); 11266 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast, 11267 DAG.getConstant(0xff800000, SL, MVT::i32)); 11268 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast); 11269 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags()); 11270 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, 11271 DAG.getTargetConstant(0, SL, MVT::i32)); 11272 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS, 11273 Op->getFlags()); 11274 } 11275 11276 // Faster 2.5 ULP division that does not support denormals. 
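// Very large denominators (|d| > 2^96) are pre-scaled by 2^-32 so the rcp
// result stays well inside the normal range; the same 2^-32 factor is folded
// into the final multiply, leaving the overall quotient unchanged.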
11277 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { 11278 SDNodeFlags Flags = Op->getFlags(); 11279 SDLoc SL(Op); 11280 SDValue LHS = Op.getOperand(1); 11281 SDValue RHS = Op.getOperand(2); 11282 11283 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); 11284 11285 const APFloat K0Val(0x1p+96f); 11286 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 11287 11288 const APFloat K1Val(0x1p-32f); 11289 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 11290 11291 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 11292 11293 EVT SetCCVT = 11294 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 11295 11296 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 11297 11298 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags); 11299 11300 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags); 11301 11302 // rcp does not support denormals. 11303 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags); 11304 11305 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags); 11306 11307 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags); 11308 } 11309 11310 // Returns immediate value for setting the F32 denorm mode when using the 11311 // S_DENORM_MODE instruction. 11312 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, 11313 const SIMachineFunctionInfo *Info, 11314 const GCNSubtarget *ST) { 11315 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); 11316 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue(); 11317 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2); 11318 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32); 11319 } 11320 11321 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 11322 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 11323 return FastLowered; 11324 11325 // The selection matcher assumes anything with a chain selecting to a 11326 // mayRaiseFPException machine instruction. Since we're introducing a chain 11327 // here, we need to explicitly report nofpexcept for the regular fdiv 11328 // lowering. 11329 SDNodeFlags Flags = Op->getFlags(); 11330 Flags.setNoFPExcept(true); 11331 11332 SDLoc SL(Op); 11333 SDValue LHS = Op.getOperand(0); 11334 SDValue RHS = Op.getOperand(1); 11335 11336 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 11337 11338 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); 11339 11340 SDValue DenominatorScaled = 11341 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags); 11342 SDValue NumeratorScaled = 11343 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags); 11344 11345 // Denominator is scaled to not be denormal, so using rcp is ok. 
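  // One Newton-Raphson step (Fma0/Fma1) refines the reciprocal estimate, and
  // the scaled quotient is then corrected with FMA residuals (Fma2..Fma4)
  // before div_fmas and div_fixup produce the final result.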
11346 SDValue ApproxRcp = 11347 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags); 11348 SDValue NegDivScale0 = 11349 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags); 11350 11351 using namespace AMDGPU::Hwreg; 11352 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2); 11353 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); 11354 11355 const MachineFunction &MF = DAG.getMachineFunction(); 11356 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 11357 const DenormalMode DenormMode = Info->getMode().FP32Denormals; 11358 11359 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); 11360 const bool HasDynamicDenormals = 11361 (DenormMode.Input == DenormalMode::Dynamic) || 11362 (DenormMode.Output == DenormalMode::Dynamic); 11363 11364 SDValue SavedDenormMode; 11365 11366 if (!PreservesDenormals) { 11367 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV 11368 // lowering. The chain dependence is insufficient, and we need glue. We do 11369 // not need the glue variants in a strictfp function. 11370 11371 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 11372 11373 SDValue Glue = DAG.getEntryNode(); 11374 if (HasDynamicDenormals) { 11375 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL, 11376 DAG.getVTList(MVT::i32, MVT::Glue), 11377 {BitField, Glue}); 11378 SavedDenormMode = SDValue(GetReg, 0); 11379 11380 Glue = DAG.getMergeValues( 11381 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL); 11382 } 11383 11384 SDNode *EnableDenorm; 11385 if (Subtarget->hasDenormModeInst()) { 11386 const SDValue EnableDenormValue = 11387 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); 11388 11389 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue, 11390 EnableDenormValue) 11391 .getNode(); 11392 } else { 11393 const SDValue EnableDenormValue = 11394 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); 11395 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, 11396 {EnableDenormValue, BitField, Glue}); 11397 } 11398 11399 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0), 11400 SDValue(EnableDenorm, 1)}; 11401 11402 NegDivScale0 = DAG.getMergeValues(Ops, SL); 11403 } 11404 11405 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, 11406 ApproxRcp, One, NegDivScale0, Flags); 11407 11408 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, 11409 ApproxRcp, Fma0, Flags); 11410 11411 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1, 11412 Fma1, Flags); 11413 11414 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, 11415 NumeratorScaled, Mul, Flags); 11416 11417 SDValue Fma3 = 11418 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags); 11419 11420 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, 11421 NumeratorScaled, Fma3, Flags); 11422 11423 if (!PreservesDenormals) { 11424 SDNode *DisableDenorm; 11425 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { 11426 const SDValue DisableDenormValue = getSPDenormModeValue( 11427 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); 11428 11429 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 11430 DisableDenorm = 11431 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, 11432 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2)) 11433 .getNode(); 11434 } else { 11435 
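      // Either restore the mode saved with S_GETREG above (dynamic denormal
      // modes) or write back the default flush-in/flush-out setting using
      // S_SETREG.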
assert(HasDynamicDenormals == (bool)SavedDenormMode); 11436 const SDValue DisableDenormValue = 11437 HasDynamicDenormals 11438 ? SavedDenormMode 11439 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); 11440 11441 DisableDenorm = DAG.getMachineNode( 11442 AMDGPU::S_SETREG_B32, SL, MVT::Other, 11443 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)}); 11444 } 11445 11446 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 11447 SDValue(DisableDenorm, 0), DAG.getRoot()); 11448 DAG.setRoot(OutputChain); 11449 } 11450 11451 SDValue Scale = NumeratorScaled.getValue(1); 11452 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, 11453 {Fma4, Fma1, Fma3, Scale}, Flags); 11454 11455 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags); 11456 } 11457 11458 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 11459 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG)) 11460 return FastLowered; 11461 11462 SDLoc SL(Op); 11463 SDValue X = Op.getOperand(0); 11464 SDValue Y = Op.getOperand(1); 11465 11466 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 11467 11468 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 11469 11470 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 11471 11472 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 11473 11474 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 11475 11476 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 11477 11478 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 11479 11480 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 11481 11482 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 11483 11484 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 11485 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 11486 11487 SDValue Fma4 = 11488 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1); 11489 11490 SDValue Scale; 11491 11492 if (!Subtarget->hasUsableDivScaleConditionOutput()) { 11493 // Workaround a hardware bug on SI where the condition output from div_scale 11494 // is not usable. 11495 11496 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 11497 11498 // Figure out if the scale to use for div_fmas. 
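    // The high dwords of the inputs are compared against the high dwords of
    // the div_scale results; the XOR of the two equality checks reconstructs
    // the missing condition bit.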
11499 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 11500 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 11501 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 11502 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 11503 11504 SDValue NumHi = 11505 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 11506 SDValue DenHi = 11507 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 11508 11509 SDValue Scale0Hi = 11510 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 11511 SDValue Scale1Hi = 11512 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 11513 11514 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 11515 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 11516 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 11517 } else { 11518 Scale = DivScale1.getValue(1); 11519 } 11520 11521 SDValue Fmas = 11522 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale); 11523 11524 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 11525 } 11526 11527 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 11528 EVT VT = Op.getValueType(); 11529 11530 if (VT == MVT::f32) 11531 return LowerFDIV32(Op, DAG); 11532 11533 if (VT == MVT::f64) 11534 return LowerFDIV64(Op, DAG); 11535 11536 if (VT == MVT::f16) 11537 return LowerFDIV16(Op, DAG); 11538 11539 llvm_unreachable("Unexpected type for fdiv"); 11540 } 11541 11542 SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const { 11543 SDLoc dl(Op); 11544 SDValue Val = Op.getOperand(0); 11545 EVT VT = Val.getValueType(); 11546 EVT ResultExpVT = Op->getValueType(1); 11547 EVT InstrExpVT = VT == MVT::f16 ? 
MVT::i16 : MVT::i32; 11548 11549 SDValue Mant = DAG.getNode( 11550 ISD::INTRINSIC_WO_CHAIN, dl, VT, 11551 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val); 11552 11553 SDValue Exp = DAG.getNode( 11554 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT, 11555 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val); 11556 11557 if (Subtarget->hasFractBug()) { 11558 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val); 11559 SDValue Inf = 11560 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT); 11561 11562 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT); 11563 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT); 11564 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero); 11565 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val); 11566 } 11567 11568 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT); 11569 return DAG.getMergeValues({Mant, CastExp}, dl); 11570 } 11571 11572 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 11573 SDLoc DL(Op); 11574 StoreSDNode *Store = cast<StoreSDNode>(Op); 11575 EVT VT = Store->getMemoryVT(); 11576 11577 if (VT == MVT::i1) { 11578 return DAG.getTruncStore( 11579 Store->getChain(), DL, 11580 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 11581 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 11582 } 11583 11584 assert(VT.isVector() && 11585 Store->getValue().getValueType().getScalarType() == MVT::i32); 11586 11587 unsigned AS = Store->getAddressSpace(); 11588 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && 11589 Store->getAlign().value() < VT.getStoreSize() && 11590 VT.getSizeInBits() > 32) { 11591 return SplitVectorStore(Op, DAG); 11592 } 11593 11594 MachineFunction &MF = DAG.getMachineFunction(); 11595 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 11596 // If there is a possibility that flat instruction access scratch memory 11597 // then we need to use the same legalization rules we use for private. 11598 if (AS == AMDGPUAS::FLAT_ADDRESS && 11599 !Subtarget->hasMultiDwordFlatScratchAddressing()) 11600 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) 11601 ? AMDGPUAS::PRIVATE_ADDRESS 11602 : AMDGPUAS::GLOBAL_ADDRESS; 11603 11604 unsigned NumElements = VT.getVectorNumElements(); 11605 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { 11606 if (NumElements > 4) 11607 return SplitVectorStore(Op, DAG); 11608 // v3 stores not supported on SI. 
11609 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 11610 return SplitVectorStore(Op, DAG); 11611 11612 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 11613 VT, *Store->getMemOperand())) 11614 return expandUnalignedStore(Store, DAG); 11615 11616 return SDValue(); 11617 } 11618 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 11619 switch (Subtarget->getMaxPrivateElementSize()) { 11620 case 4: 11621 return scalarizeVectorStore(Store, DAG); 11622 case 8: 11623 if (NumElements > 2) 11624 return SplitVectorStore(Op, DAG); 11625 return SDValue(); 11626 case 16: 11627 if (NumElements > 4 || 11628 (NumElements == 3 && !Subtarget->enableFlatScratch())) 11629 return SplitVectorStore(Op, DAG); 11630 return SDValue(); 11631 default: 11632 llvm_unreachable("unsupported private_element_size"); 11633 } 11634 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 11635 unsigned Fast = 0; 11636 auto Flags = Store->getMemOperand()->getFlags(); 11637 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, 11638 Store->getAlign(), Flags, &Fast) && 11639 Fast > 1) 11640 return SDValue(); 11641 11642 if (VT.isVector()) 11643 return SplitVectorStore(Op, DAG); 11644 11645 return expandUnalignedStore(Store, DAG); 11646 } 11647 11648 // Probably an invalid store. If so we'll end up emitting a selection error. 11649 return SDValue(); 11650 } 11651 11652 // Avoid the full correct expansion for f32 sqrt when promoting from f16. 11653 SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { 11654 SDLoc SL(Op); 11655 assert(!Subtarget->has16BitInsts()); 11656 SDNodeFlags Flags = Op->getFlags(); 11657 SDValue Ext = 11658 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags); 11659 11660 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32); 11661 SDValue Sqrt = 11662 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags); 11663 11664 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt, 11665 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 11666 } 11667 11668 SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { 11669 SDLoc DL(Op); 11670 SDNodeFlags Flags = Op->getFlags(); 11671 MVT VT = Op.getValueType().getSimpleVT(); 11672 const SDValue X = Op.getOperand(0); 11673 11674 if (allowApproxFunc(DAG, Flags)) { 11675 // Instruction is 1ulp but ignores denormals. 
11676 return DAG.getNode( 11677 ISD::INTRINSIC_WO_CHAIN, DL, VT, 11678 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags); 11679 } 11680 11681 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); 11682 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT); 11683 11684 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT); 11685 11686 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags); 11687 11688 SDValue SqrtX = 11689 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags); 11690 11691 SDValue SqrtS; 11692 if (needsDenormHandlingF32(DAG, X, Flags)) { 11693 SDValue SqrtID = 11694 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32); 11695 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); 11696 11697 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); 11698 SDValue SqrtSNextDownInt = 11699 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, 11700 DAG.getAllOnesConstant(DL, MVT::i32)); 11701 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); 11702 11703 SDValue NegSqrtSNextDown = 11704 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags); 11705 11706 SDValue SqrtVP = 11707 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 11708 11709 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, 11710 DAG.getConstant(1, DL, MVT::i32)); 11711 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt); 11712 11713 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags); 11714 SDValue SqrtVS = 11715 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 11716 11717 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); 11718 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE); 11719 11720 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS, 11721 Flags); 11722 11723 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT); 11724 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS, 11725 Flags); 11726 } else { 11727 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags); 11728 11729 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags); 11730 11731 SDValue Half = DAG.getConstantFP(0.5f, DL, VT); 11732 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags); 11733 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags); 11734 11735 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags); 11736 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags); 11737 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags); 11738 11739 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags); 11740 SDValue SqrtD = 11741 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags); 11742 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags); 11743 } 11744 11745 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); 11746 11747 SDValue ScaledDown = 11748 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags); 11749 11750 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags); 11751 SDValue IsZeroOrInf = 11752 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, 11753 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); 11754 11755 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags); 11756 } 11757 11758 SDValue SITargetLowering::lowerFSQRTF64(SDValue 
Op, SelectionDAG &DAG) const { 11759 // For double type, the SQRT and RSQ instructions don't have required 11760 // precision, we apply Goldschmidt's algorithm to improve the result: 11761 // 11762 // y0 = rsq(x) 11763 // g0 = x * y0 11764 // h0 = 0.5 * y0 11765 // 11766 // r0 = 0.5 - h0 * g0 11767 // g1 = g0 * r0 + g0 11768 // h1 = h0 * r0 + h0 11769 // 11770 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 11771 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 11772 // h2 = h1 * r1 + h1 11773 // 11774 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 11775 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 11776 // 11777 // sqrt(x) = g3 11778 11779 SDNodeFlags Flags = Op->getFlags(); 11780 11781 SDLoc DL(Op); 11782 11783 SDValue X = Op.getOperand(0); 11784 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); 11785 11786 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); 11787 11788 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32); 11789 11790 // Scale up input if it is too small. 11791 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); 11792 SDValue ScaleUp = 11793 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); 11794 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); 11795 11796 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX); 11797 11798 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY); 11799 11800 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64); 11801 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half); 11802 11803 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0); 11804 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half); 11805 11806 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0); 11807 11808 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0); 11809 11810 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1); 11811 SDValue SqrtD0 = 11812 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX); 11813 11814 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1); 11815 11816 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); 11817 SDValue SqrtD1 = 11818 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); 11819 11820 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); 11821 11822 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32); 11823 SDValue ScaleDown = 11824 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); 11825 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); 11826 11827 // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check 11828 // with finite only or nsz because rsq(+/-0) = +/-inf 11829 11830 // TODO: Check for DAZ and expand to subnormals 11831 SDValue IsZeroOrInf = 11832 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, 11833 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); 11834 11835 // If x is +INF, +0, or -0, use its original value 11836 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet, 11837 Flags); 11838 } 11839 11840 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 11841 SDLoc DL(Op); 11842 EVT VT = Op.getValueType(); 11843 SDValue Arg = Op.getOperand(0); 11844 SDValue TrigVal; 11845 11846 // Propagate fast-math flags so that the multiply we introduce can be folded 11847 // if Arg is already the result of a multiply by constant. 11848 auto Flags = Op->getFlags(); 11849 11850 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT); 11851 11852 if (Subtarget->hasTrigReducedRange()) { 11853 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); 11854 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); 11855 } else { 11856 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); 11857 } 11858 11859 switch (Op.getOpcode()) { 11860 case ISD::FCOS: 11861 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); 11862 case ISD::FSIN: 11863 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); 11864 default: 11865 llvm_unreachable("Wrong trig opcode"); 11866 } 11867 } 11868 11869 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, 11870 SelectionDAG &DAG) const { 11871 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); 11872 assert(AtomicNode->isCompareAndSwap()); 11873 unsigned AS = AtomicNode->getAddressSpace(); 11874 11875 // No custom lowering required for local address space 11876 if (!AMDGPU::isFlatGlobalAddrSpace(AS)) 11877 return Op; 11878 11879 // Non-local address space requires custom lowering for atomic compare 11880 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 11881 SDLoc DL(Op); 11882 SDValue ChainIn = Op.getOperand(0); 11883 SDValue Addr = Op.getOperand(1); 11884 SDValue Old = Op.getOperand(2); 11885 SDValue New = Op.getOperand(3); 11886 EVT VT = Op.getValueType(); 11887 MVT SimpleVT = VT.getSimpleVT(); 11888 MVT VecType = MVT::getVectorVT(SimpleVT, 2); 11889 11890 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); 11891 SDValue Ops[] = {ChainIn, Addr, NewOld}; 11892 11893 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, 11894 Op->getVTList(), Ops, VT, 11895 AtomicNode->getMemOperand()); 11896 } 11897 11898 //===----------------------------------------------------------------------===// 11899 // Custom DAG optimizations 11900 //===----------------------------------------------------------------------===// 11901 11902 SDValue 11903 SITargetLowering::performUCharToFloatCombine(SDNode *N, 11904 DAGCombinerInfo &DCI) const { 11905 EVT VT = N->getValueType(0); 11906 EVT ScalarVT = VT.getScalarType(); 11907 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) 11908 return SDValue(); 11909 11910 SelectionDAG &DAG = DCI.DAG; 11911 SDLoc DL(N); 11912 11913 SDValue Src = N->getOperand(0); 11914 EVT SrcVT = Src.getValueType(); 11915 11916 // TODO: We could try to match extracting the higher bytes, which would be 11917 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 11918 // types are legalized. 
v4i8 -> v4f32 is probably the only case to worry 11919 // about in practice. 11920 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { 11921 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 11922 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src); 11923 DCI.AddToWorklist(Cvt.getNode()); 11924 11925 // For the f16 case, fold to a cast to f32 and then cast back to f16. 11926 if (ScalarVT != MVT::f32) { 11927 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt, 11928 DAG.getTargetConstant(0, DL, MVT::i32)); 11929 } 11930 return Cvt; 11931 } 11932 } 11933 11934 return SDValue(); 11935 } 11936 11937 SDValue SITargetLowering::performFCopySignCombine(SDNode *N, 11938 DAGCombinerInfo &DCI) const { 11939 SDValue MagnitudeOp = N->getOperand(0); 11940 SDValue SignOp = N->getOperand(1); 11941 11942 // The generic combine for fcopysign + fp cast is too conservative with 11943 // vectors, and also gets confused by the splitting we will perform here, so 11944 // peek through FP casts. 11945 if (SignOp.getOpcode() == ISD::FP_EXTEND || 11946 SignOp.getOpcode() == ISD::FP_ROUND) 11947 SignOp = SignOp.getOperand(0); 11948 11949 SelectionDAG &DAG = DCI.DAG; 11950 SDLoc DL(N); 11951 EVT SignVT = SignOp.getValueType(); 11952 11953 // f64 fcopysign is really an f32 copysign on the high bits, so replace the 11954 // lower half with a copy. 11955 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) 11956 EVT MagVT = MagnitudeOp.getValueType(); 11957 11958 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; 11959 11960 if (MagVT.getScalarType() == MVT::f64) { 11961 EVT F32VT = MagVT.isVector() 11962 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) 11963 : MVT::v2f32; 11964 11965 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp); 11966 11967 SmallVector<SDValue, 8> NewElts; 11968 for (unsigned I = 0; I != NumElts; ++I) { 11969 SDValue MagLo = 11970 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, 11971 DAG.getConstant(2 * I, DL, MVT::i32)); 11972 SDValue MagHi = 11973 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, 11974 DAG.getConstant(2 * I + 1, DL, MVT::i32)); 11975 11976 SDValue SignOpElt = 11977 MagVT.isVector() 11978 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(), 11979 SignOp, DAG.getConstant(I, DL, MVT::i32)) 11980 : SignOp; 11981 11982 SDValue HiOp = 11983 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt); 11984 11985 SDValue Vector = 11986 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); 11987 11988 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); 11989 NewElts.push_back(NewElt); 11990 } 11991 11992 if (NewElts.size() == 1) 11993 return NewElts[0]; 11994 11995 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); 11996 } 11997 11998 if (SignVT.getScalarType() != MVT::f64) 11999 return SDValue(); 12000 12001 // Reduce width of sign operand, we only need the highest bit. 12002 // 12003 // fcopysign f64:x, f64:y -> 12004 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) 12005 // TODO: In some cases it might make sense to go all the way to f16. 12006 12007 EVT F32VT = MagVT.isVector() 12008 ? 
EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) 12009 : MVT::v2f32; 12010 12011 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); 12012 12013 SmallVector<SDValue, 8> F32Signs; 12014 for (unsigned I = 0; I != NumElts; ++I) { 12015 // Take sign from odd elements of cast vector 12016 SDValue SignAsF32 = 12017 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, 12018 DAG.getConstant(2 * I + 1, DL, MVT::i32)); 12019 F32Signs.push_back(SignAsF32); 12020 } 12021 12022 SDValue NewSign = 12023 NumElts == 1 12024 ? F32Signs.back() 12025 : DAG.getNode(ISD::BUILD_VECTOR, DL, 12026 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), 12027 F32Signs); 12028 12029 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), 12030 NewSign); 12031 } 12032 12033 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 12034 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no 12035 // bits 12036 12037 // This is a variant of 12038 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 12039 // 12040 // The normal DAG combiner will do this, but only if the add has one use since 12041 // that would increase the number of instructions. 12042 // 12043 // This prevents us from seeing a constant offset that can be folded into a 12044 // memory instruction's addressing mode. If we know the resulting add offset of 12045 // a pointer can be folded into an addressing offset, we can replace the pointer 12046 // operand with the add of new constant offset. This eliminates one of the uses, 12047 // and may allow the remaining use to also be simplified. 12048 // 12049 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace, 12050 EVT MemVT, 12051 DAGCombinerInfo &DCI) const { 12052 SDValue N0 = N->getOperand(0); 12053 SDValue N1 = N->getOperand(1); 12054 12055 // We only do this to handle cases where it's profitable when there are 12056 // multiple uses of the add, so defer to the standard combine. 12057 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || 12058 N0->hasOneUse()) 12059 return SDValue(); 12060 12061 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 12062 if (!CN1) 12063 return SDValue(); 12064 12065 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 12066 if (!CAdd) 12067 return SDValue(); 12068 12069 SelectionDAG &DAG = DCI.DAG; 12070 12071 if (N0->getOpcode() == ISD::OR && 12072 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) 12073 return SDValue(); 12074 12075 // If the resulting offset is too large, we can't fold it into the 12076 // addressing mode offset. 
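// Illustrative example: for a load addressed by (shl (add x, 16), 2), the
// candidate immediate below is 16 << 2 = 64; if a 64-byte offset is legal for
// this address space, the node is rewritten as (add (shl x, 2), 64) so the
// constant can later fold into the memory instruction's offset field.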
12077 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 12078 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext()); 12079 12080 AddrMode AM; 12081 AM.HasBaseReg = true; 12082 AM.BaseOffs = Offset.getSExtValue(); 12083 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace)) 12084 return SDValue(); 12085 12086 SDLoc SL(N); 12087 EVT VT = N->getValueType(0); 12088 12089 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 12090 SDValue COffset = DAG.getConstant(Offset, SL, VT); 12091 12092 SDNodeFlags Flags; 12093 Flags.setNoUnsignedWrap( 12094 N->getFlags().hasNoUnsignedWrap() && 12095 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap())); 12096 12097 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags); 12098 } 12099 12100 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset 12101 /// by the chain and intrinsic ID. Theoretically we would also need to check the 12102 /// specific intrinsic, but they all place the pointer operand first. 12103 static unsigned getBasePtrIndex(const MemSDNode *N) { 12104 switch (N->getOpcode()) { 12105 case ISD::STORE: 12106 case ISD::INTRINSIC_W_CHAIN: 12107 case ISD::INTRINSIC_VOID: 12108 return 2; 12109 default: 12110 return 1; 12111 } 12112 } 12113 12114 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, 12115 DAGCombinerInfo &DCI) const { 12116 SelectionDAG &DAG = DCI.DAG; 12117 12118 unsigned PtrIdx = getBasePtrIndex(N); 12119 SDValue Ptr = N->getOperand(PtrIdx); 12120 12121 // TODO: We could also do this for multiplies. 12122 if (Ptr.getOpcode() == ISD::SHL) { 12123 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), 12124 N->getMemoryVT(), DCI); 12125 if (NewPtr) { 12126 SmallVector<SDValue, 8> NewOps(N->ops()); 12127 12128 NewOps[PtrIdx] = NewPtr; 12129 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); 12130 } 12131 } 12132 12133 return SDValue(); 12134 } 12135 12136 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { 12137 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || 12138 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || 12139 (Opc == ISD::XOR && Val == 0); 12140 } 12141 12142 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This 12143 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit 12144 // integer combine opportunities since most 64-bit operations are decomposed 12145 // this way. TODO: We won't want this for SALU especially if it is an inline 12146 // immediate. 12147 SDValue SITargetLowering::splitBinaryBitConstantOp( 12148 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, 12149 const ConstantSDNode *CRHS) const { 12150 uint64_t Val = CRHS->getZExtValue(); 12151 uint32_t ValLo = Lo_32(Val); 12152 uint32_t ValHi = Hi_32(Val); 12153 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 12154 12155 if ((bitOpWithConstantIsReducible(Opc, ValLo) || 12156 bitOpWithConstantIsReducible(Opc, ValHi)) || 12157 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { 12158 // We have 64-bit scalar and/or/xor, but do not have vector forms. 12159 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() && 12160 !CRHS->user_begin()->isDivergent()) 12161 return SDValue(); 12162 12163 // If we need to materialize a 64-bit immediate, it will be split up later 12164 // anyway. Avoid creating the harder to understand 64-bit immediate 12165 // materialization. 
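// Illustrative example: (and i64:x, 0xFFFFFFFF000000FF) is broken into 32-bit
// ops on the two halves; the high-half AND with 0xFFFFFFFF folds away, leaving
// a single 32-bit AND of the low half with 0xFF.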
12166 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); 12167 } 12168 12169 return SDValue(); 12170 } 12171 12172 bool llvm::isBoolSGPR(SDValue V) { 12173 if (V.getValueType() != MVT::i1) 12174 return false; 12175 switch (V.getOpcode()) { 12176 default: 12177 break; 12178 case ISD::SETCC: 12179 case ISD::IS_FPCLASS: 12180 case AMDGPUISD::FP_CLASS: 12181 return true; 12182 case ISD::AND: 12183 case ISD::OR: 12184 case ISD::XOR: 12185 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1)); 12186 case ISD::SADDO: 12187 case ISD::UADDO: 12188 case ISD::SSUBO: 12189 case ISD::USUBO: 12190 case ISD::SMULO: 12191 case ISD::UMULO: 12192 return V.getResNo() == 1; 12193 case ISD::INTRINSIC_WO_CHAIN: { 12194 unsigned IntrinsicID = V.getConstantOperandVal(0); 12195 switch (IntrinsicID) { 12196 case Intrinsic::amdgcn_is_shared: 12197 case Intrinsic::amdgcn_is_private: 12198 return true; 12199 default: 12200 return false; 12201 } 12202 12203 return false; 12204 } 12205 } 12206 return false; 12207 } 12208 12209 // If a constant has all zeroes or all ones within each byte return it. 12210 // Otherwise return 0. 12211 static uint32_t getConstantPermuteMask(uint32_t C) { 12212 // 0xff for any zero byte in the mask 12213 uint32_t ZeroByteMask = 0; 12214 if (!(C & 0x000000ff)) 12215 ZeroByteMask |= 0x000000ff; 12216 if (!(C & 0x0000ff00)) 12217 ZeroByteMask |= 0x0000ff00; 12218 if (!(C & 0x00ff0000)) 12219 ZeroByteMask |= 0x00ff0000; 12220 if (!(C & 0xff000000)) 12221 ZeroByteMask |= 0xff000000; 12222 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte 12223 if ((NonZeroByteMask & C) != NonZeroByteMask) 12224 return 0; // Partial bytes selected. 12225 return C; 12226 } 12227 12228 // Check if a node selects whole bytes from its operand 0 starting at a byte 12229 // boundary while masking the rest. Returns select mask as in the v_perm_b32 12230 // or -1 if not succeeded. 12231 // Note byte select encoding: 12232 // value 0-3 selects corresponding source byte; 12233 // value 0xc selects zero; 12234 // value 0xff selects 0xff. 
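// For example (illustrative): (and x, 0x0000ffff) maps to the mask 0x0c0c0100
// (result bytes 0 and 1 come from the source, the two high bytes are zeroed
// via 0x0c), and (shl x, 8) maps to 0x0201000c.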
12235 static uint32_t getPermuteMask(SDValue V) { 12236 assert(V.getValueSizeInBits() == 32); 12237 12238 if (V.getNumOperands() != 2) 12239 return ~0; 12240 12241 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1)); 12242 if (!N1) 12243 return ~0; 12244 12245 uint32_t C = N1->getZExtValue(); 12246 12247 switch (V.getOpcode()) { 12248 default: 12249 break; 12250 case ISD::AND: 12251 if (uint32_t ConstMask = getConstantPermuteMask(C)) 12252 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); 12253 break; 12254 12255 case ISD::OR: 12256 if (uint32_t ConstMask = getConstantPermuteMask(C)) 12257 return (0x03020100 & ~ConstMask) | ConstMask; 12258 break; 12259 12260 case ISD::SHL: 12261 if (C % 8) 12262 return ~0; 12263 12264 return uint32_t((0x030201000c0c0c0cull << C) >> 32); 12265 12266 case ISD::SRL: 12267 if (C % 8) 12268 return ~0; 12269 12270 return uint32_t(0x0c0c0c0c03020100ull >> C); 12271 } 12272 12273 return ~0; 12274 } 12275 12276 SDValue SITargetLowering::performAndCombine(SDNode *N, 12277 DAGCombinerInfo &DCI) const { 12278 if (DCI.isBeforeLegalize()) 12279 return SDValue(); 12280 12281 SelectionDAG &DAG = DCI.DAG; 12282 EVT VT = N->getValueType(0); 12283 SDValue LHS = N->getOperand(0); 12284 SDValue RHS = N->getOperand(1); 12285 12286 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 12287 if (VT == MVT::i64 && CRHS) { 12288 if (SDValue Split = 12289 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) 12290 return Split; 12291 } 12292 12293 if (CRHS && VT == MVT::i32) { 12294 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb 12295 // nb = number of trailing zeroes in mask 12296 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, 12297 // given that we are selecting 8 or 16 bit fields starting at byte boundary. 12298 uint64_t Mask = CRHS->getZExtValue(); 12299 unsigned Bits = llvm::popcount(Mask); 12300 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && 12301 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { 12302 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { 12303 unsigned Shift = CShift->getZExtValue(); 12304 unsigned NB = CRHS->getAPIntValue().countr_zero(); 12305 unsigned Offset = NB + Shift; 12306 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. 
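// Illustrative example: (and (srl x, 8), 0xff00) has Bits = 8, Shift = 8 and
// NB = 8, so Offset = 16 and the node becomes
// (shl (AssertZext i8 (bfe_u32 x, 16, 8)), 8).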
12307 SDLoc SL(N); 12308 SDValue BFE = 12309 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0), 12310 DAG.getConstant(Offset, SL, MVT::i32), 12311 DAG.getConstant(Bits, SL, MVT::i32)); 12312 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); 12313 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, 12314 DAG.getValueType(NarrowVT)); 12315 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, 12316 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); 12317 return Shl; 12318 } 12319 } 12320 } 12321 12322 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) 12323 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM && 12324 isa<ConstantSDNode>(LHS.getOperand(2))) { 12325 uint32_t Sel = getConstantPermuteMask(Mask); 12326 if (!Sel) 12327 return SDValue(); 12328 12329 // Select 0xc for all zero bytes 12330 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c); 12331 SDLoc DL(N); 12332 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 12333 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); 12334 } 12335 } 12336 12337 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 12338 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 12339 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { 12340 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 12341 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 12342 12343 SDValue X = LHS.getOperand(0); 12344 SDValue Y = RHS.getOperand(0); 12345 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X || 12346 !isTypeLegal(X.getValueType())) 12347 return SDValue(); 12348 12349 if (LCC == ISD::SETO) { 12350 if (X != LHS.getOperand(1)) 12351 return SDValue(); 12352 12353 if (RCC == ISD::SETUNE) { 12354 const ConstantFPSDNode *C1 = 12355 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 12356 if (!C1 || !C1->isInfinity() || C1->isNegative()) 12357 return SDValue(); 12358 12359 const uint32_t Mask = SIInstrFlags::N_NORMAL | 12360 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO | 12361 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL | 12362 SIInstrFlags::P_NORMAL; 12363 12364 static_assert( 12365 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN | 12366 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) & 12367 0x3ff) == Mask, 12368 "mask not equal"); 12369 12370 SDLoc DL(N); 12371 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X, 12372 DAG.getConstant(Mask, DL, MVT::i32)); 12373 } 12374 } 12375 } 12376 12377 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS) 12378 std::swap(LHS, RHS); 12379 12380 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS && 12381 RHS.hasOneUse()) { 12382 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 12383 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | 12384 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan 12385 // | n_nan) 12386 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 12387 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask && 12388 (RHS.getOperand(0) == LHS.getOperand(0) && 12389 LHS.getOperand(0) == LHS.getOperand(1))) { 12390 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN; 12391 unsigned NewMask = LCC == ISD::SETO ? 
Mask->getZExtValue() & ~OrdMask 12392 : Mask->getZExtValue() & OrdMask; 12393 12394 SDLoc DL(N); 12395 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0), 12396 DAG.getConstant(NewMask, DL, MVT::i32)); 12397 } 12398 } 12399 12400 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND || 12401 LHS.getOpcode() == ISD::SIGN_EXTEND)) { 12402 // and x, (sext cc from i1) => select cc, x, 0 12403 if (RHS.getOpcode() != ISD::SIGN_EXTEND) 12404 std::swap(LHS, RHS); 12405 if (isBoolSGPR(RHS.getOperand(0))) 12406 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS, 12407 DAG.getConstant(0, SDLoc(N), MVT::i32)); 12408 } 12409 12410 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) 12411 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 12412 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && 12413 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 12414 uint32_t LHSMask = getPermuteMask(LHS); 12415 uint32_t RHSMask = getPermuteMask(RHS); 12416 if (LHSMask != ~0u && RHSMask != ~0u) { 12417 // Canonicalize the expression in an attempt to have fewer unique masks 12418 // and therefore fewer registers used to hold the masks. 12419 if (LHSMask > RHSMask) { 12420 std::swap(LHSMask, RHSMask); 12421 std::swap(LHS, RHS); 12422 } 12423 12424 // Select 0xc for each lane used from source operand. Zero has 0xc mask 12425 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. 12426 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 12427 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 12428 12429 // Check of we need to combine values from two sources within a byte. 12430 if (!(LHSUsedLanes & RHSUsedLanes) && 12431 // If we select high and lower word keep it for SDWA. 12432 // TODO: teach SDWA to work with v_perm_b32 and remove the check. 12433 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { 12434 // Each byte in each mask is either selector mask 0-3, or has higher 12435 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for 12436 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise 12437 // mask which is not 0xff wins. By anding both masks we have a correct 12438 // result except that 0x0c shall be corrected to give 0x0c only. 12439 uint32_t Mask = LHSMask & RHSMask; 12440 for (unsigned I = 0; I < 32; I += 8) { 12441 uint32_t ByteSel = 0xff << I; 12442 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c) 12443 Mask &= (0x0c << I) & 0xffffffff; 12444 } 12445 12446 // Add 4 to each active LHS lane. It will not affect any existing 0xff 12447 // or 0x0c. 12448 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); 12449 SDLoc DL(N); 12450 12451 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 12452 RHS.getOperand(0), 12453 DAG.getConstant(Sel, DL, MVT::i32)); 12454 } 12455 } 12456 } 12457 12458 return SDValue(); 12459 } 12460 12461 // A key component of v_perm is a mapping between byte position of the src 12462 // operands, and the byte position of the dest. To provide such, we need: 1. the 12463 // node that provides x byte of the dest of the OR, and 2. the byte of the node 12464 // used to provide that x byte. 
calculateByteProvider finds which node provides 12465 // a certain byte of the dest of the OR, and calculateSrcByte takes that node, 12466 // and finds an ultimate src and byte position For example: The supported 12467 // LoadCombine pattern for vector loads is as follows 12468 // t1 12469 // or 12470 // / \ 12471 // t2 t3 12472 // zext shl 12473 // | | \ 12474 // t4 t5 16 12475 // or anyext 12476 // / \ | 12477 // t6 t7 t8 12478 // srl shl or 12479 // / | / \ / \ 12480 // t9 t10 t11 t12 t13 t14 12481 // trunc* 8 trunc* 8 and and 12482 // | | / | | \ 12483 // t15 t16 t17 t18 t19 t20 12484 // trunc* 255 srl -256 12485 // | / \ 12486 // t15 t15 16 12487 // 12488 // *In this example, the truncs are from i32->i16 12489 // 12490 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 12491 // respectively. calculateSrcByte would find (given node) -> ultimate src & 12492 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. 12493 // After finding the mapping, we can combine the tree into vperm t15, t16, 12494 // 0x05000407 12495 12496 // Find the source and byte position from a node. 12497 // \p DestByte is the byte position of the dest of the or that the src 12498 // ultimately provides. \p SrcIndex is the byte of the src that maps to this 12499 // dest of the or byte. \p Depth tracks how many recursive iterations we have 12500 // performed. 12501 static const std::optional<ByteProvider<SDValue>> 12502 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, 12503 unsigned Depth = 0) { 12504 // We may need to recursively traverse a series of SRLs 12505 if (Depth >= 6) 12506 return std::nullopt; 12507 12508 if (Op.getValueSizeInBits() < 8) 12509 return std::nullopt; 12510 12511 if (Op.getValueType().isVector()) 12512 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); 12513 12514 switch (Op->getOpcode()) { 12515 case ISD::TRUNCATE: { 12516 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 12517 } 12518 12519 case ISD::SIGN_EXTEND: 12520 case ISD::ZERO_EXTEND: 12521 case ISD::SIGN_EXTEND_INREG: { 12522 SDValue NarrowOp = Op->getOperand(0); 12523 auto NarrowVT = NarrowOp.getValueType(); 12524 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { 12525 auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); 12526 NarrowVT = VTSign->getVT(); 12527 } 12528 if (!NarrowVT.isByteSized()) 12529 return std::nullopt; 12530 uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); 12531 12532 if (SrcIndex >= NarrowByteWidth) 12533 return std::nullopt; 12534 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 12535 } 12536 12537 case ISD::SRA: 12538 case ISD::SRL: { 12539 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12540 if (!ShiftOp) 12541 return std::nullopt; 12542 12543 uint64_t BitShift = ShiftOp->getZExtValue(); 12544 12545 if (BitShift % 8 != 0) 12546 return std::nullopt; 12547 12548 SrcIndex += BitShift / 8; 12549 12550 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 12551 } 12552 12553 default: { 12554 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); 12555 } 12556 } 12557 llvm_unreachable("fully handled switch"); 12558 } 12559 12560 // For a byte position in the result of an Or, traverse the tree and find the 12561 // node (and the byte of the node) which ultimately provides this {Or, 12562 // BytePosition}. \p Op is the operand we are currently examining. 
\p Index is 12563 // the byte position of the Op that corresponds with the originally requested 12564 // byte of the Or \p Depth tracks how many recursive iterations we have 12565 // performed. \p StartingIndex is the originally requested byte of the Or 12566 static const std::optional<ByteProvider<SDValue>> 12567 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, 12568 unsigned StartingIndex = 0) { 12569 // Finding Src tree of RHS of or typically requires at least 1 additional 12570 // depth 12571 if (Depth > 6) 12572 return std::nullopt; 12573 12574 unsigned BitWidth = Op.getScalarValueSizeInBits(); 12575 if (BitWidth % 8 != 0) 12576 return std::nullopt; 12577 if (Index > BitWidth / 8 - 1) 12578 return std::nullopt; 12579 12580 bool IsVec = Op.getValueType().isVector(); 12581 switch (Op.getOpcode()) { 12582 case ISD::OR: { 12583 if (IsVec) 12584 return std::nullopt; 12585 12586 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1, 12587 StartingIndex); 12588 if (!RHS) 12589 return std::nullopt; 12590 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1, 12591 StartingIndex); 12592 if (!LHS) 12593 return std::nullopt; 12594 // A well formed Or will have two ByteProviders for each byte, one of which 12595 // is constant zero 12596 if (!LHS->isConstantZero() && !RHS->isConstantZero()) 12597 return std::nullopt; 12598 if (!LHS || LHS->isConstantZero()) 12599 return RHS; 12600 if (!RHS || RHS->isConstantZero()) 12601 return LHS; 12602 return std::nullopt; 12603 } 12604 12605 case ISD::AND: { 12606 if (IsVec) 12607 return std::nullopt; 12608 12609 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12610 if (!BitMaskOp) 12611 return std::nullopt; 12612 12613 uint32_t BitMask = BitMaskOp->getZExtValue(); 12614 // Bits we expect for our StartingIndex 12615 uint32_t IndexMask = 0xFF << (Index * 8); 12616 12617 if ((IndexMask & BitMask) != IndexMask) { 12618 // If the result of the and partially provides the byte, then it 12619 // is not well formatted 12620 if (IndexMask & BitMask) 12621 return std::nullopt; 12622 return ByteProvider<SDValue>::getConstantZero(); 12623 } 12624 12625 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); 12626 } 12627 12628 case ISD::FSHR: { 12629 if (IsVec) 12630 return std::nullopt; 12631 12632 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 12633 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2)); 12634 if (!ShiftOp || Op.getValueType().isVector()) 12635 return std::nullopt; 12636 12637 uint64_t BitsProvided = Op.getValueSizeInBits(); 12638 if (BitsProvided % 8 != 0) 12639 return std::nullopt; 12640 12641 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided); 12642 if (BitShift % 8) 12643 return std::nullopt; 12644 12645 uint64_t ConcatSizeInBytes = BitsProvided / 4; 12646 uint64_t ByteShift = BitShift / 8; 12647 12648 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes; 12649 uint64_t BytesProvided = BitsProvided / 8; 12650 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 
0 : 1); 12651 NewIndex %= BytesProvided; 12652 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex); 12653 } 12654 12655 case ISD::SRA: 12656 case ISD::SRL: { 12657 if (IsVec) 12658 return std::nullopt; 12659 12660 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12661 if (!ShiftOp) 12662 return std::nullopt; 12663 12664 uint64_t BitShift = ShiftOp->getZExtValue(); 12665 if (BitShift % 8) 12666 return std::nullopt; 12667 12668 auto BitsProvided = Op.getScalarValueSizeInBits(); 12669 if (BitsProvided % 8 != 0) 12670 return std::nullopt; 12671 12672 uint64_t BytesProvided = BitsProvided / 8; 12673 uint64_t ByteShift = BitShift / 8; 12674 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes. 12675 // If the byte we are trying to provide (as tracked by index) falls in this 12676 // range, then the SRL provides the byte. The byte of interest of the src of 12677 // the SRL is Index + ByteShift 12678 return BytesProvided - ByteShift > Index 12679 ? calculateSrcByte(Op->getOperand(0), StartingIndex, 12680 Index + ByteShift) 12681 : ByteProvider<SDValue>::getConstantZero(); 12682 } 12683 12684 case ISD::SHL: { 12685 if (IsVec) 12686 return std::nullopt; 12687 12688 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12689 if (!ShiftOp) 12690 return std::nullopt; 12691 12692 uint64_t BitShift = ShiftOp->getZExtValue(); 12693 if (BitShift % 8 != 0) 12694 return std::nullopt; 12695 uint64_t ByteShift = BitShift / 8; 12696 12697 // If we are shifting by an amount greater than (or equal to) 12698 // the index we are trying to provide, then it provides 0s. If not, 12699 // then this bytes are not definitively 0s, and the corresponding byte 12700 // of interest is Index - ByteShift of the src 12701 return Index < ByteShift 12702 ? ByteProvider<SDValue>::getConstantZero() 12703 : calculateByteProvider(Op.getOperand(0), Index - ByteShift, 12704 Depth + 1, StartingIndex); 12705 } 12706 case ISD::ANY_EXTEND: 12707 case ISD::SIGN_EXTEND: 12708 case ISD::ZERO_EXTEND: 12709 case ISD::SIGN_EXTEND_INREG: 12710 case ISD::AssertZext: 12711 case ISD::AssertSext: { 12712 if (IsVec) 12713 return std::nullopt; 12714 12715 SDValue NarrowOp = Op->getOperand(0); 12716 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); 12717 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || 12718 Op->getOpcode() == ISD::AssertZext || 12719 Op->getOpcode() == ISD::AssertSext) { 12720 auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); 12721 NarrowBitWidth = VTSign->getVT().getSizeInBits(); 12722 } 12723 if (NarrowBitWidth % 8 != 0) 12724 return std::nullopt; 12725 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 12726 12727 if (Index >= NarrowByteWidth) 12728 return Op.getOpcode() == ISD::ZERO_EXTEND 12729 ? 
std::optional<ByteProvider<SDValue>>( 12730 ByteProvider<SDValue>::getConstantZero()) 12731 : std::nullopt; 12732 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex); 12733 } 12734 12735 case ISD::TRUNCATE: { 12736 if (IsVec) 12737 return std::nullopt; 12738 12739 uint64_t NarrowByteWidth = BitWidth / 8; 12740 12741 if (NarrowByteWidth >= Index) { 12742 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, 12743 StartingIndex); 12744 } 12745 12746 return std::nullopt; 12747 } 12748 12749 case ISD::CopyFromReg: { 12750 if (BitWidth / 8 > Index) 12751 return calculateSrcByte(Op, StartingIndex, Index); 12752 12753 return std::nullopt; 12754 } 12755 12756 case ISD::LOAD: { 12757 auto *L = cast<LoadSDNode>(Op.getNode()); 12758 12759 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); 12760 if (NarrowBitWidth % 8 != 0) 12761 return std::nullopt; 12762 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 12763 12764 // If the width of the load does not reach byte we are trying to provide for 12765 // and it is not a ZEXTLOAD, then the load does not provide for the byte in 12766 // question 12767 if (Index >= NarrowByteWidth) { 12768 return L->getExtensionType() == ISD::ZEXTLOAD 12769 ? std::optional<ByteProvider<SDValue>>( 12770 ByteProvider<SDValue>::getConstantZero()) 12771 : std::nullopt; 12772 } 12773 12774 if (NarrowByteWidth > Index) { 12775 return calculateSrcByte(Op, StartingIndex, Index); 12776 } 12777 12778 return std::nullopt; 12779 } 12780 12781 case ISD::BSWAP: { 12782 if (IsVec) 12783 return std::nullopt; 12784 12785 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, 12786 Depth + 1, StartingIndex); 12787 } 12788 12789 case ISD::EXTRACT_VECTOR_ELT: { 12790 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12791 if (!IdxOp) 12792 return std::nullopt; 12793 auto VecIdx = IdxOp->getZExtValue(); 12794 auto ScalarSize = Op.getScalarValueSizeInBits(); 12795 if (ScalarSize < 32) 12796 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; 12797 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0), 12798 StartingIndex, Index); 12799 } 12800 12801 case AMDGPUISD::PERM: { 12802 if (IsVec) 12803 return std::nullopt; 12804 12805 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2)); 12806 if (!PermMask) 12807 return std::nullopt; 12808 12809 auto IdxMask = 12810 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); 12811 if (IdxMask > 0x07 && IdxMask != 0x0c) 12812 return std::nullopt; 12813 12814 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1); 12815 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask; 12816 12817 return IdxMask != 0x0c ? 
calculateSrcByte(NextOp, StartingIndex, NextIndex) 12818 : ByteProvider<SDValue>( 12819 ByteProvider<SDValue>::getConstantZero()); 12820 } 12821 12822 default: { 12823 return std::nullopt; 12824 } 12825 } 12826 12827 llvm_unreachable("fully handled switch"); 12828 } 12829 12830 // Returns true if the Operand is a scalar and is 16 bits 12831 static bool isExtendedFrom16Bits(SDValue &Operand) { 12832 12833 switch (Operand.getOpcode()) { 12834 case ISD::ANY_EXTEND: 12835 case ISD::SIGN_EXTEND: 12836 case ISD::ZERO_EXTEND: { 12837 auto OpVT = Operand.getOperand(0).getValueType(); 12838 return !OpVT.isVector() && OpVT.getSizeInBits() == 16; 12839 } 12840 case ISD::LOAD: { 12841 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode()); 12842 auto ExtType = cast<LoadSDNode>(L)->getExtensionType(); 12843 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD || 12844 ExtType == ISD::EXTLOAD) { 12845 auto MemVT = L->getMemoryVT(); 12846 return !MemVT.isVector() && MemVT.getSizeInBits() == 16; 12847 } 12848 return L->getMemoryVT().getSizeInBits() == 16; 12849 } 12850 default: 12851 return false; 12852 } 12853 } 12854 12855 // Returns true if the mask matches consecutive bytes, and the first byte 12856 // begins at a power of 2 byte offset from 0th byte 12857 static bool addresses16Bits(int Mask) { 12858 int Low8 = Mask & 0xff; 12859 int Hi8 = (Mask & 0xff00) >> 8; 12860 12861 assert(Low8 < 8 && Hi8 < 8); 12862 // Are the bytes contiguous in the order of increasing addresses. 12863 bool IsConsecutive = (Hi8 - Low8 == 1); 12864 // Is the first byte at location that is aligned for 16 bit instructions. 12865 // A counter example is taking 2 consecutive bytes starting at the 8th bit. 12866 // In this case, we still need code to extract the 16 bit operand, so it 12867 // is better to use i8 v_perm 12868 bool Is16Aligned = !(Low8 % 2); 12869 12870 return IsConsecutive && Is16Aligned; 12871 } 12872 12873 // Do not lower into v_perm if the operands are actually 16 bit 12874 // and the selected bits (based on PermMask) correspond with two 12875 // easily addressable 16 bit operands. 
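// For example (illustrative): with PermMask 0x07060302, Low16 = 0x0302 and
// Hi16 = 0x0706 each select two consecutive, 2-byte-aligned bytes from a
// single operand; if both operands are really only 16 bits wide this returns
// false and the caller keeps the 16-bit form instead of emitting v_perm.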
12876 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, 12877 SDValue &OtherOp) { 12878 int Low16 = PermMask & 0xffff; 12879 int Hi16 = (PermMask & 0xffff0000) >> 16; 12880 12881 auto TempOp = peekThroughBitcasts(Op); 12882 auto TempOtherOp = peekThroughBitcasts(OtherOp); 12883 12884 auto OpIs16Bit = 12885 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp); 12886 if (!OpIs16Bit) 12887 return true; 12888 12889 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || 12890 isExtendedFrom16Bits(TempOtherOp); 12891 if (!OtherOpIs16Bit) 12892 return true; 12893 12894 // Do we cleanly address both 12895 return !addresses16Bits(Low16) || !addresses16Bits(Hi16); 12896 } 12897 12898 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, 12899 unsigned DWordOffset) { 12900 SDValue Ret; 12901 12902 auto TypeSize = Src.getValueSizeInBits().getFixedValue(); 12903 // ByteProvider must be at least 8 bits 12904 assert(Src.getValueSizeInBits().isKnownMultipleOf(8)); 12905 12906 if (TypeSize <= 32) 12907 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32); 12908 12909 if (Src.getValueType().isVector()) { 12910 auto ScalarTySize = Src.getScalarValueSizeInBits(); 12911 auto ScalarTy = Src.getValueType().getScalarType(); 12912 if (ScalarTySize == 32) { 12913 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src, 12914 DAG.getConstant(DWordOffset, SL, MVT::i32)); 12915 } 12916 if (ScalarTySize > 32) { 12917 Ret = DAG.getNode( 12918 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src, 12919 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32)); 12920 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32)); 12921 if (ShiftVal) 12922 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret, 12923 DAG.getConstant(ShiftVal, SL, MVT::i32)); 12924 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12925 } 12926 12927 assert(ScalarTySize < 32); 12928 auto NumElements = TypeSize / ScalarTySize; 12929 auto Trunc32Elements = (ScalarTySize * NumElements) / 32; 12930 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize; 12931 auto NumElementsIn32 = 32 / ScalarTySize; 12932 auto NumAvailElements = DWordOffset < Trunc32Elements 12933 ? NumElementsIn32 12934 : NumElements - NormalizedTrunc; 12935 12936 SmallVector<SDValue, 4> VecSrcs; 12937 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32, 12938 NumAvailElements); 12939 12940 Ret = DAG.getBuildVector( 12941 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL, 12942 VecSrcs); 12943 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12944 } 12945 12946 /// Scalar Type 12947 auto ShiftVal = 32 * DWordOffset; 12948 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src, 12949 DAG.getConstant(ShiftVal, SL, MVT::i32)); 12950 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12951 } 12952 12953 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12954 SelectionDAG &DAG = DCI.DAG; 12955 [[maybe_unused]] EVT VT = N->getValueType(0); 12956 SmallVector<ByteProvider<SDValue>, 8> PermNodes; 12957 12958 // VT is known to be MVT::i32, so we need to provide 4 bytes. 
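// Bytes provided by the first distinct source dword are encoded below with
// selector values 4-7, bytes from the second distinct source keep values 0-3,
// matching the Src1:Src2 byte numbering of v_perm_b32.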
12959 assert(VT == MVT::i32); 12960 for (int i = 0; i < 4; i++) { 12961 // Find the ByteProvider that provides the ith byte of the result of OR 12962 std::optional<ByteProvider<SDValue>> P = 12963 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); 12964 // TODO support constantZero 12965 if (!P || P->isConstantZero()) 12966 return SDValue(); 12967 12968 PermNodes.push_back(*P); 12969 } 12970 if (PermNodes.size() != 4) 12971 return SDValue(); 12972 12973 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4); 12974 std::optional<std::pair<unsigned, unsigned>> SecondSrc; 12975 uint64_t PermMask = 0x00000000; 12976 for (size_t i = 0; i < PermNodes.size(); i++) { 12977 auto PermOp = PermNodes[i]; 12978 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset 12979 // by sizeof(Src2) = 4 12980 int SrcByteAdjust = 4; 12981 12982 // If the Src uses a byte from a different DWORD, then it corresponds 12983 // with a difference source 12984 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) || 12985 ((PermOp.SrcOffset / 4) != FirstSrc.second)) { 12986 if (SecondSrc) 12987 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) || 12988 ((PermOp.SrcOffset / 4) != SecondSrc->second)) 12989 return SDValue(); 12990 12991 // Set the index of the second distinct Src node 12992 SecondSrc = {i, PermNodes[i].SrcOffset / 4}; 12993 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8)); 12994 SrcByteAdjust = 0; 12995 } 12996 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8); 12997 assert(!DAG.getDataLayout().isBigEndian()); 12998 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8); 12999 } 13000 SDLoc DL(N); 13001 SDValue Op = *PermNodes[FirstSrc.first].Src; 13002 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second); 13003 assert(Op.getValueSizeInBits() == 32); 13004 13005 // Check that we are not just extracting the bytes in order from an op 13006 if (!SecondSrc) { 13007 int Low16 = PermMask & 0xffff; 13008 int Hi16 = (PermMask & 0xffff0000) >> 16; 13009 13010 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); 13011 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); 13012 13013 // The perm op would really just produce Op. So combine into Op 13014 if (WellFormedLow && WellFormedHi) 13015 return DAG.getBitcast(MVT::getIntegerVT(32), Op); 13016 } 13017 13018 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op; 13019 13020 if (SecondSrc) { 13021 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second); 13022 assert(OtherOp.getValueSizeInBits() == 32); 13023 } 13024 13025 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { 13026 13027 assert(Op.getValueType().isByteSized() && 13028 OtherOp.getValueType().isByteSized()); 13029 13030 // If the ultimate src is less than 32 bits, then we will only be 13031 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. 13032 // CalculateByteProvider would not have returned Op as source if we 13033 // used a byte that is outside its ValueType. Thus, we are free to 13034 // ANY_EXTEND as the extended bits are dont-cares. 
13035 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32); 13036 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32); 13037 13038 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, 13039 DAG.getConstant(PermMask, DL, MVT::i32)); 13040 } 13041 return SDValue(); 13042 } 13043 13044 SDValue SITargetLowering::performOrCombine(SDNode *N, 13045 DAGCombinerInfo &DCI) const { 13046 SelectionDAG &DAG = DCI.DAG; 13047 SDValue LHS = N->getOperand(0); 13048 SDValue RHS = N->getOperand(1); 13049 13050 EVT VT = N->getValueType(0); 13051 if (VT == MVT::i1) { 13052 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 13053 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 13054 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 13055 SDValue Src = LHS.getOperand(0); 13056 if (Src != RHS.getOperand(0)) 13057 return SDValue(); 13058 13059 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 13060 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 13061 if (!CLHS || !CRHS) 13062 return SDValue(); 13063 13064 // Only 10 bits are used. 13065 static const uint32_t MaxMask = 0x3ff; 13066 13067 uint32_t NewMask = 13068 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 13069 SDLoc DL(N); 13070 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src, 13071 DAG.getConstant(NewMask, DL, MVT::i32)); 13072 } 13073 13074 return SDValue(); 13075 } 13076 13077 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) 13078 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() && 13079 LHS.getOpcode() == AMDGPUISD::PERM && 13080 isa<ConstantSDNode>(LHS.getOperand(2))) { 13081 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1)); 13082 if (!Sel) 13083 return SDValue(); 13084 13085 Sel |= LHS.getConstantOperandVal(2); 13086 SDLoc DL(N); 13087 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 13088 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); 13089 } 13090 13091 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) 13092 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 13093 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && 13094 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 13095 13096 // If all the uses of an or need to extract the individual elements, do not 13097 // attempt to lower into v_perm 13098 auto usesCombinedOperand = [](SDNode *OrUse) { 13099 // If we have any non-vectorized use, then it is a candidate for v_perm 13100 if (OrUse->getOpcode() != ISD::BITCAST || 13101 !OrUse->getValueType(0).isVector()) 13102 return true; 13103 13104 // If we have any non-vectorized use, then it is a candidate for v_perm 13105 for (auto *VUser : OrUse->users()) { 13106 if (!VUser->getValueType(0).isVector()) 13107 return true; 13108 13109 // If the use of a vector is a store, then combining via a v_perm 13110 // is beneficial. 13111 // TODO -- whitelist more uses 13112 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg}) 13113 if (VUser->getOpcode() == VectorwiseOp) 13114 return true; 13115 } 13116 return false; 13117 }; 13118 13119 if (!any_of(N->users(), usesCombinedOperand)) 13120 return SDValue(); 13121 13122 uint32_t LHSMask = getPermuteMask(LHS); 13123 uint32_t RHSMask = getPermuteMask(RHS); 13124 13125 if (LHSMask != ~0u && RHSMask != ~0u) { 13126 // Canonicalize the expression in an attempt to have fewer unique masks 13127 // and therefore fewer registers used to hold the masks. 
13128 if (LHSMask > RHSMask) { 13129 std::swap(LHSMask, RHSMask); 13130 std::swap(LHS, RHS); 13131 } 13132 13133 // Select 0xc for each lane used from source operand. Zero has 0xc mask 13134 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. 13135 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 13136 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 13137 13138 // Check of we need to combine values from two sources within a byte. 13139 if (!(LHSUsedLanes & RHSUsedLanes) && 13140 // If we select high and lower word keep it for SDWA. 13141 // TODO: teach SDWA to work with v_perm_b32 and remove the check. 13142 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { 13143 // Kill zero bytes selected by other mask. Zero value is 0xc. 13144 LHSMask &= ~RHSUsedLanes; 13145 RHSMask &= ~LHSUsedLanes; 13146 // Add 4 to each active LHS lane 13147 LHSMask |= LHSUsedLanes & 0x04040404; 13148 // Combine masks 13149 uint32_t Sel = LHSMask | RHSMask; 13150 SDLoc DL(N); 13151 13152 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 13153 RHS.getOperand(0), 13154 DAG.getConstant(Sel, DL, MVT::i32)); 13155 } 13156 } 13157 if (LHSMask == ~0u || RHSMask == ~0u) { 13158 if (SDValue Perm = matchPERM(N, DCI)) 13159 return Perm; 13160 } 13161 } 13162 13163 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) 13164 return SDValue(); 13165 13166 // TODO: This could be a generic combine with a predicate for extracting the 13167 // high half of an integer being free. 13168 13169 // (or i64:x, (zero_extend i32:y)) -> 13170 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) 13171 if (LHS.getOpcode() == ISD::ZERO_EXTEND && 13172 RHS.getOpcode() != ISD::ZERO_EXTEND) 13173 std::swap(LHS, RHS); 13174 13175 if (RHS.getOpcode() == ISD::ZERO_EXTEND) { 13176 SDValue ExtSrc = RHS.getOperand(0); 13177 EVT SrcVT = ExtSrc.getValueType(); 13178 if (SrcVT == MVT::i32) { 13179 SDLoc SL(N); 13180 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG); 13181 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); 13182 13183 DCI.AddToWorklist(LowOr.getNode()); 13184 DCI.AddToWorklist(HiBits.getNode()); 13185 13186 SDValue Vec = 13187 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits); 13188 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 13189 } 13190 } 13191 13192 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13193 if (CRHS) { 13194 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, 13195 N->getOperand(0), CRHS)) 13196 return Split; 13197 } 13198 13199 return SDValue(); 13200 } 13201 13202 SDValue SITargetLowering::performXorCombine(SDNode *N, 13203 DAGCombinerInfo &DCI) const { 13204 if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) 13205 return RV; 13206 13207 SDValue LHS = N->getOperand(0); 13208 SDValue RHS = N->getOperand(1); 13209 13210 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 13211 SelectionDAG &DAG = DCI.DAG; 13212 13213 EVT VT = N->getValueType(0); 13214 if (CRHS && VT == MVT::i64) { 13215 if (SDValue Split = 13216 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) 13217 return Split; 13218 } 13219 13220 // Make sure to apply the 64-bit constant splitting fold before trying to fold 13221 // fneg-like xors into 64-bit select. 13222 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) { 13223 // This looks like an fneg, try to fold as a source modifier. 
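// The i32 xor with the sign mask often arises from splitting a 64-bit
// fneg-style xor into halves; folding it into the select lets both arms carry
// the negation as a source modifier instead.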
13224 if (CRHS && CRHS->getAPIntValue().isSignMask() && 13225 shouldFoldFNegIntoSrc(N, LHS)) { 13226 // xor (select c, a, b), 0x80000000 -> 13227 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b))) 13228 SDLoc DL(N); 13229 SDValue CastLHS = 13230 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1)); 13231 SDValue CastRHS = 13232 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2)); 13233 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS); 13234 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS); 13235 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32, 13236 LHS->getOperand(0), FNegLHS, FNegRHS); 13237 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect); 13238 } 13239 } 13240 13241 return SDValue(); 13242 } 13243 13244 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, 13245 DAGCombinerInfo &DCI) const { 13246 if (!Subtarget->has16BitInsts() || 13247 DCI.getDAGCombineLevel() < AfterLegalizeDAG) 13248 return SDValue(); 13249 13250 EVT VT = N->getValueType(0); 13251 if (VT != MVT::i32) 13252 return SDValue(); 13253 13254 SDValue Src = N->getOperand(0); 13255 if (Src.getValueType() != MVT::i16) 13256 return SDValue(); 13257 13258 return SDValue(); 13259 } 13260 13261 SDValue 13262 SITargetLowering::performSignExtendInRegCombine(SDNode *N, 13263 DAGCombinerInfo &DCI) const { 13264 SDValue Src = N->getOperand(0); 13265 auto *VTSign = cast<VTSDNode>(N->getOperand(1)); 13266 13267 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them 13268 // with s_buffer_load_i8 and s_buffer_load_i16 respectively. 13269 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE && 13270 VTSign->getVT() == MVT::i8) || 13271 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT && 13272 VTSign->getVT() == MVT::i16))) { 13273 assert(Subtarget->hasScalarSubwordLoads() && 13274 "s_buffer_load_{u8, i8} are supported " 13275 "in GFX12 (or newer) architectures."); 13276 EVT VT = Src.getValueType(); 13277 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE) 13278 ? AMDGPUISD::SBUFFER_LOAD_BYTE 13279 : AMDGPUISD::SBUFFER_LOAD_SHORT; 13280 SDLoc DL(N); 13281 SDVTList ResList = DCI.DAG.getVTList(MVT::i32); 13282 SDValue Ops[] = { 13283 Src.getOperand(0), // source register 13284 Src.getOperand(1), // offset 13285 Src.getOperand(2) // cachePolicy 13286 }; 13287 auto *M = cast<MemSDNode>(Src); 13288 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode( 13289 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand()); 13290 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 13291 return LoadVal; 13292 } 13293 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && 13294 VTSign->getVT() == MVT::i8) || 13295 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && 13296 VTSign->getVT() == MVT::i16)) && 13297 Src.hasOneUse()) { 13298 auto *M = cast<MemSDNode>(Src); 13299 SDValue Ops[] = {Src.getOperand(0), // Chain 13300 Src.getOperand(1), // rsrc 13301 Src.getOperand(2), // vindex 13302 Src.getOperand(3), // voffset 13303 Src.getOperand(4), // soffset 13304 Src.getOperand(5), // offset 13305 Src.getOperand(6), Src.getOperand(7)}; 13306 // replace with BUFFER_LOAD_BYTE/SHORT 13307 SDVTList ResList = 13308 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType()); 13309 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) 13310 ? 
AMDGPUISD::BUFFER_LOAD_BYTE 13311 : AMDGPUISD::BUFFER_LOAD_SHORT; 13312 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode( 13313 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand()); 13314 return DCI.DAG.getMergeValues( 13315 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N)); 13316 } 13317 return SDValue(); 13318 } 13319 13320 SDValue SITargetLowering::performClassCombine(SDNode *N, 13321 DAGCombinerInfo &DCI) const { 13322 SelectionDAG &DAG = DCI.DAG; 13323 SDValue Mask = N->getOperand(1); 13324 13325 // fp_class x, 0 -> false 13326 if (isNullConstant(Mask)) 13327 return DAG.getConstant(0, SDLoc(N), MVT::i1); 13328 13329 if (N->getOperand(0).isUndef()) 13330 return DAG.getUNDEF(MVT::i1); 13331 13332 return SDValue(); 13333 } 13334 13335 SDValue SITargetLowering::performRcpCombine(SDNode *N, 13336 DAGCombinerInfo &DCI) const { 13337 EVT VT = N->getValueType(0); 13338 SDValue N0 = N->getOperand(0); 13339 13340 if (N0.isUndef()) { 13341 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()), 13342 SDLoc(N), VT); 13343 } 13344 13345 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || 13346 N0.getOpcode() == ISD::SINT_TO_FP)) { 13347 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, 13348 N->getFlags()); 13349 } 13350 13351 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. 13352 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && 13353 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { 13354 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0), 13355 N->getFlags()); 13356 } 13357 13358 return AMDGPUTargetLowering::performRcpCombine(N, DCI); 13359 } 13360 13361 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, 13362 unsigned MaxDepth) const { 13363 unsigned Opcode = Op.getOpcode(); 13364 if (Opcode == ISD::FCANONICALIZE) 13365 return true; 13366 13367 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 13368 const auto &F = CFP->getValueAPF(); 13369 if (F.isNaN() && F.isSignaling()) 13370 return false; 13371 if (!F.isDenormal()) 13372 return true; 13373 13374 DenormalMode Mode = 13375 DAG.getMachineFunction().getDenormalMode(F.getSemantics()); 13376 return Mode == DenormalMode::getIEEE(); 13377 } 13378 13379 // If source is a result of another standard FP operation it is already in 13380 // canonical form. 13381 if (MaxDepth == 0) 13382 return false; 13383 13384 switch (Opcode) { 13385 // These will flush denorms if required. 
13386 case ISD::FADD: 13387 case ISD::FSUB: 13388 case ISD::FMUL: 13389 case ISD::FCEIL: 13390 case ISD::FFLOOR: 13391 case ISD::FMA: 13392 case ISD::FMAD: 13393 case ISD::FSQRT: 13394 case ISD::FDIV: 13395 case ISD::FREM: 13396 case ISD::FP_ROUND: 13397 case ISD::FP_EXTEND: 13398 case ISD::FP16_TO_FP: 13399 case ISD::FP_TO_FP16: 13400 case ISD::BF16_TO_FP: 13401 case ISD::FP_TO_BF16: 13402 case ISD::FLDEXP: 13403 case AMDGPUISD::FMUL_LEGACY: 13404 case AMDGPUISD::FMAD_FTZ: 13405 case AMDGPUISD::RCP: 13406 case AMDGPUISD::RSQ: 13407 case AMDGPUISD::RSQ_CLAMP: 13408 case AMDGPUISD::RCP_LEGACY: 13409 case AMDGPUISD::RCP_IFLAG: 13410 case AMDGPUISD::LOG: 13411 case AMDGPUISD::EXP: 13412 case AMDGPUISD::DIV_SCALE: 13413 case AMDGPUISD::DIV_FMAS: 13414 case AMDGPUISD::DIV_FIXUP: 13415 case AMDGPUISD::FRACT: 13416 case AMDGPUISD::CVT_PKRTZ_F16_F32: 13417 case AMDGPUISD::CVT_F32_UBYTE0: 13418 case AMDGPUISD::CVT_F32_UBYTE1: 13419 case AMDGPUISD::CVT_F32_UBYTE2: 13420 case AMDGPUISD::CVT_F32_UBYTE3: 13421 case AMDGPUISD::FP_TO_FP16: 13422 case AMDGPUISD::SIN_HW: 13423 case AMDGPUISD::COS_HW: 13424 return true; 13425 13426 // It can/will be lowered or combined as a bit operation. 13427 // Need to check their input recursively to handle. 13428 case ISD::FNEG: 13429 case ISD::FABS: 13430 case ISD::FCOPYSIGN: 13431 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 13432 13433 case ISD::AND: 13434 if (Op.getValueType() == MVT::i32) { 13435 // Be careful as we only know it is a bitcast floating point type. It 13436 // could be f32, v2f16, we have no way of knowing. Luckily the constant 13437 // value that we optimize for, which comes up in fp32 to bf16 conversions, 13438 // is valid to optimize for all types. 13439 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13440 if (RHS->getZExtValue() == 0xffff0000) { 13441 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 13442 } 13443 } 13444 } 13445 break; 13446 13447 case ISD::FSIN: 13448 case ISD::FCOS: 13449 case ISD::FSINCOS: 13450 return Op.getValueType().getScalarType() != MVT::f16; 13451 13452 case ISD::FMINNUM: 13453 case ISD::FMAXNUM: 13454 case ISD::FMINNUM_IEEE: 13455 case ISD::FMAXNUM_IEEE: 13456 case ISD::FMINIMUM: 13457 case ISD::FMAXIMUM: 13458 case ISD::FMINIMUMNUM: 13459 case ISD::FMAXIMUMNUM: 13460 case AMDGPUISD::CLAMP: 13461 case AMDGPUISD::FMED3: 13462 case AMDGPUISD::FMAX3: 13463 case AMDGPUISD::FMIN3: 13464 case AMDGPUISD::FMAXIMUM3: 13465 case AMDGPUISD::FMINIMUM3: { 13466 // FIXME: Shouldn't treat the generic operations different based these. 13467 // However, we aren't really required to flush the result from 13468 // minnum/maxnum.. 13469 13470 // snans will be quieted, so we only need to worry about denormals. 13471 if (Subtarget->supportsMinMaxDenormModes() || 13472 // FIXME: denormalsEnabledForType is broken for dynamic 13473 denormalsEnabledForType(DAG, Op.getValueType())) 13474 return true; 13475 13476 // Flushing may be required. 13477 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such 13478 // targets need to check their input recursively. 13479 13480 // FIXME: Does this apply with clamp? It's implemented with max. 
13481 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { 13482 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1)) 13483 return false; 13484 } 13485 13486 return true; 13487 } 13488 case ISD::SELECT: { 13489 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && 13490 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1); 13491 } 13492 case ISD::BUILD_VECTOR: { 13493 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { 13494 SDValue SrcOp = Op.getOperand(i); 13495 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1)) 13496 return false; 13497 } 13498 13499 return true; 13500 } 13501 case ISD::EXTRACT_VECTOR_ELT: 13502 case ISD::EXTRACT_SUBVECTOR: { 13503 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 13504 } 13505 case ISD::INSERT_VECTOR_ELT: { 13506 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && 13507 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); 13508 } 13509 case ISD::UNDEF: 13510 // Could be anything. 13511 return false; 13512 13513 case ISD::BITCAST: 13514 // TODO: This is incorrect as it loses track of the operand's type. We may 13515 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the 13516 // same bits that are canonicalized in one type need not be in the other. 13517 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 13518 case ISD::TRUNCATE: { 13519 // Hack round the mess we make when legalizing extract_vector_elt 13520 if (Op.getValueType() == MVT::i16) { 13521 SDValue TruncSrc = Op.getOperand(0); 13522 if (TruncSrc.getValueType() == MVT::i32 && 13523 TruncSrc.getOpcode() == ISD::BITCAST && 13524 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) { 13525 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); 13526 } 13527 } 13528 return false; 13529 } 13530 case ISD::INTRINSIC_WO_CHAIN: { 13531 unsigned IntrinsicID = Op.getConstantOperandVal(0); 13532 // TODO: Handle more intrinsics 13533 switch (IntrinsicID) { 13534 case Intrinsic::amdgcn_cvt_pkrtz: 13535 case Intrinsic::amdgcn_cubeid: 13536 case Intrinsic::amdgcn_frexp_mant: 13537 case Intrinsic::amdgcn_fdot2: 13538 case Intrinsic::amdgcn_rcp: 13539 case Intrinsic::amdgcn_rsq: 13540 case Intrinsic::amdgcn_rsq_clamp: 13541 case Intrinsic::amdgcn_rcp_legacy: 13542 case Intrinsic::amdgcn_rsq_legacy: 13543 case Intrinsic::amdgcn_trig_preop: 13544 case Intrinsic::amdgcn_log: 13545 case Intrinsic::amdgcn_exp2: 13546 case Intrinsic::amdgcn_sqrt: 13547 return true; 13548 default: 13549 break; 13550 } 13551 13552 break; 13553 } 13554 default: 13555 break; 13556 } 13557 13558 // FIXME: denormalsEnabledForType is broken for dynamic 13559 return denormalsEnabledForType(DAG, Op.getValueType()) && 13560 DAG.isKnownNeverSNaN(Op); 13561 } 13562 13563 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, 13564 unsigned MaxDepth) const { 13565 const MachineRegisterInfo &MRI = MF.getRegInfo(); 13566 MachineInstr *MI = MRI.getVRegDef(Reg); 13567 unsigned Opcode = MI->getOpcode(); 13568 13569 if (Opcode == AMDGPU::G_FCANONICALIZE) 13570 return true; 13571 13572 std::optional<FPValueAndVReg> FCR; 13573 // Constant splat (can be padded with undef) or scalar constant. 
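// Mirrors the constant handling of the SelectionDAG isCanonicalized above:
// signaling NaNs are not canonical, non-denormal constants are, and denormal
// constants are canonical only when the function uses IEEE denormal mode.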
13574 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { 13575 if (FCR->Value.isSignaling()) 13576 return false; 13577 if (!FCR->Value.isDenormal()) 13578 return true; 13579 13580 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics()); 13581 return Mode == DenormalMode::getIEEE(); 13582 } 13583 13584 if (MaxDepth == 0) 13585 return false; 13586 13587 switch (Opcode) { 13588 case AMDGPU::G_FADD: 13589 case AMDGPU::G_FSUB: 13590 case AMDGPU::G_FMUL: 13591 case AMDGPU::G_FCEIL: 13592 case AMDGPU::G_FFLOOR: 13593 case AMDGPU::G_FRINT: 13594 case AMDGPU::G_FNEARBYINT: 13595 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: 13596 case AMDGPU::G_INTRINSIC_TRUNC: 13597 case AMDGPU::G_INTRINSIC_ROUNDEVEN: 13598 case AMDGPU::G_FMA: 13599 case AMDGPU::G_FMAD: 13600 case AMDGPU::G_FSQRT: 13601 case AMDGPU::G_FDIV: 13602 case AMDGPU::G_FREM: 13603 case AMDGPU::G_FPOW: 13604 case AMDGPU::G_FPEXT: 13605 case AMDGPU::G_FLOG: 13606 case AMDGPU::G_FLOG2: 13607 case AMDGPU::G_FLOG10: 13608 case AMDGPU::G_FPTRUNC: 13609 case AMDGPU::G_AMDGPU_RCP_IFLAG: 13610 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 13611 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 13612 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 13613 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 13614 return true; 13615 case AMDGPU::G_FNEG: 13616 case AMDGPU::G_FABS: 13617 case AMDGPU::G_FCOPYSIGN: 13618 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1); 13619 case AMDGPU::G_FMINNUM: 13620 case AMDGPU::G_FMAXNUM: 13621 case AMDGPU::G_FMINNUM_IEEE: 13622 case AMDGPU::G_FMAXNUM_IEEE: 13623 case AMDGPU::G_FMINIMUM: 13624 case AMDGPU::G_FMAXIMUM: 13625 case AMDGPU::G_FMINIMUMNUM: 13626 case AMDGPU::G_FMAXIMUMNUM: { 13627 if (Subtarget->supportsMinMaxDenormModes() || 13628 // FIXME: denormalsEnabledForType is broken for dynamic 13629 denormalsEnabledForType(MRI.getType(Reg), MF)) 13630 return true; 13631 13632 [[fallthrough]]; 13633 } 13634 case AMDGPU::G_BUILD_VECTOR: 13635 for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) 13636 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1)) 13637 return false; 13638 return true; 13639 case AMDGPU::G_INTRINSIC: 13640 case AMDGPU::G_INTRINSIC_CONVERGENT: 13641 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { 13642 case Intrinsic::amdgcn_fmul_legacy: 13643 case Intrinsic::amdgcn_fmad_ftz: 13644 case Intrinsic::amdgcn_sqrt: 13645 case Intrinsic::amdgcn_fmed3: 13646 case Intrinsic::amdgcn_sin: 13647 case Intrinsic::amdgcn_cos: 13648 case Intrinsic::amdgcn_log: 13649 case Intrinsic::amdgcn_exp2: 13650 case Intrinsic::amdgcn_log_clamp: 13651 case Intrinsic::amdgcn_rcp: 13652 case Intrinsic::amdgcn_rcp_legacy: 13653 case Intrinsic::amdgcn_rsq: 13654 case Intrinsic::amdgcn_rsq_clamp: 13655 case Intrinsic::amdgcn_rsq_legacy: 13656 case Intrinsic::amdgcn_div_scale: 13657 case Intrinsic::amdgcn_div_fmas: 13658 case Intrinsic::amdgcn_div_fixup: 13659 case Intrinsic::amdgcn_fract: 13660 case Intrinsic::amdgcn_cvt_pkrtz: 13661 case Intrinsic::amdgcn_cubeid: 13662 case Intrinsic::amdgcn_cubema: 13663 case Intrinsic::amdgcn_cubesc: 13664 case Intrinsic::amdgcn_cubetc: 13665 case Intrinsic::amdgcn_frexp_mant: 13666 case Intrinsic::amdgcn_fdot2: 13667 case Intrinsic::amdgcn_trig_preop: 13668 case Intrinsic::amdgcn_tanh: 13669 return true; 13670 default: 13671 break; 13672 } 13673 13674 [[fallthrough]]; 13675 default: 13676 return false; 13677 } 13678 13679 llvm_unreachable("invalid operation"); 13680 } 13681 13682 // Constant fold canonicalize. 
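// For example (illustrative): with f32 denormals flushed (PreserveSign mode),
// a denormal constant folds to a signed zero, and a signaling NaN constant
// folds to the default quiet NaN bit pattern.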
13683 SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG, 13684 const SDLoc &SL, EVT VT, 13685 const APFloat &C) const { 13686 // Flush denormals to 0 if not enabled. 13687 if (C.isDenormal()) { 13688 DenormalMode Mode = 13689 DAG.getMachineFunction().getDenormalMode(C.getSemantics()); 13690 if (Mode == DenormalMode::getPreserveSign()) { 13691 return DAG.getConstantFP( 13692 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT); 13693 } 13694 13695 if (Mode != DenormalMode::getIEEE()) 13696 return SDValue(); 13697 } 13698 13699 if (C.isNaN()) { 13700 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 13701 if (C.isSignaling()) { 13702 // Quiet a signaling NaN. 13703 // FIXME: Is this supposed to preserve payload bits? 13704 return DAG.getConstantFP(CanonicalQNaN, SL, VT); 13705 } 13706 13707 // Make sure it is the canonical NaN bitpattern. 13708 // 13709 // TODO: Can we use -1 as the canonical NaN value since it's an inline 13710 // immediate? 13711 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) 13712 return DAG.getConstantFP(CanonicalQNaN, SL, VT); 13713 } 13714 13715 // Already canonical. 13716 return DAG.getConstantFP(C, SL, VT); 13717 } 13718 13719 static bool vectorEltWillFoldAway(SDValue Op) { 13720 return Op.isUndef() || isa<ConstantFPSDNode>(Op); 13721 } 13722 13723 SDValue 13724 SITargetLowering::performFCanonicalizeCombine(SDNode *N, 13725 DAGCombinerInfo &DCI) const { 13726 SelectionDAG &DAG = DCI.DAG; 13727 SDValue N0 = N->getOperand(0); 13728 EVT VT = N->getValueType(0); 13729 13730 // fcanonicalize undef -> qnan 13731 if (N0.isUndef()) { 13732 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics()); 13733 return DAG.getConstantFP(QNaN, SDLoc(N), VT); 13734 } 13735 13736 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) { 13737 EVT VT = N->getValueType(0); 13738 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF()); 13739 } 13740 13741 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), 13742 // (fcanonicalize k) 13743 // 13744 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 13745 13746 // TODO: This could be better with wider vectors that will be split to v2f16, 13747 // and to consider uses since there aren't that many packed operations. 13748 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 && 13749 isTypeLegal(MVT::v2f16)) { 13750 SDLoc SL(N); 13751 SDValue NewElts[2]; 13752 SDValue Lo = N0.getOperand(0); 13753 SDValue Hi = N0.getOperand(1); 13754 EVT EltVT = Lo.getValueType(); 13755 13756 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) { 13757 for (unsigned I = 0; I != 2; ++I) { 13758 SDValue Op = N0.getOperand(I); 13759 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 13760 NewElts[I] = 13761 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF()); 13762 } else if (Op.isUndef()) { 13763 // Handled below based on what the other operand is. 13764 NewElts[I] = Op; 13765 } else { 13766 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op); 13767 } 13768 } 13769 13770 // If one half is undef, and one is constant, prefer a splat vector rather 13771 // than the normal qNaN. If it's a register, prefer 0.0 since that's 13772 // cheaper to use and may be free with a packed operation. 13773 if (NewElts[0].isUndef()) { 13774 if (isa<ConstantFPSDNode>(NewElts[1])) 13775 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) 13776 ? 
NewElts[1] 13777 : DAG.getConstantFP(0.0f, SL, EltVT); 13778 } 13779 13780 if (NewElts[1].isUndef()) { 13781 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) 13782 ? NewElts[0] 13783 : DAG.getConstantFP(0.0f, SL, EltVT); 13784 } 13785 13786 return DAG.getBuildVector(VT, SL, NewElts); 13787 } 13788 } 13789 13790 return SDValue(); 13791 } 13792 13793 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 13794 switch (Opc) { 13795 case ISD::FMAXNUM: 13796 case ISD::FMAXNUM_IEEE: 13797 case ISD::FMAXIMUMNUM: 13798 return AMDGPUISD::FMAX3; 13799 case ISD::FMAXIMUM: 13800 return AMDGPUISD::FMAXIMUM3; 13801 case ISD::SMAX: 13802 return AMDGPUISD::SMAX3; 13803 case ISD::UMAX: 13804 return AMDGPUISD::UMAX3; 13805 case ISD::FMINNUM: 13806 case ISD::FMINNUM_IEEE: 13807 case ISD::FMINIMUMNUM: 13808 return AMDGPUISD::FMIN3; 13809 case ISD::FMINIMUM: 13810 return AMDGPUISD::FMINIMUM3; 13811 case ISD::SMIN: 13812 return AMDGPUISD::SMIN3; 13813 case ISD::UMIN: 13814 return AMDGPUISD::UMIN3; 13815 default: 13816 llvm_unreachable("Not a min/max opcode"); 13817 } 13818 } 13819 13820 SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, 13821 const SDLoc &SL, SDValue Src, 13822 SDValue MinVal, 13823 SDValue MaxVal, 13824 bool Signed) const { 13825 13826 // med3 comes from 13827 // min(max(x, K0), K1), K0 < K1 13828 // max(min(x, K0), K1), K1 < K0 13829 // 13830 // "MinVal" and "MaxVal" respectively refer to the rhs of the 13831 // min/max op. 13832 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal); 13833 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal); 13834 13835 if (!MinK || !MaxK) 13836 return SDValue(); 13837 13838 if (Signed) { 13839 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue())) 13840 return SDValue(); 13841 } else { 13842 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue())) 13843 return SDValue(); 13844 } 13845 13846 EVT VT = MinK->getValueType(0); 13847 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; 13848 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) 13849 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal); 13850 13851 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is 13852 // not available, but this is unlikely to be profitable as constants 13853 // will often need to be materialized & extended, especially on 13854 // pre-GFX10 where VOP3 instructions couldn't take literal operands. 13855 return SDValue(); 13856 } 13857 13858 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { 13859 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) 13860 return C; 13861 13862 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) { 13863 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) 13864 return C; 13865 } 13866 13867 return nullptr; 13868 } 13869 13870 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, 13871 const SDLoc &SL, SDValue Op0, 13872 SDValue Op1) const { 13873 ConstantFPSDNode *K1 = getSplatConstantFP(Op1); 13874 if (!K1) 13875 return SDValue(); 13876 13877 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1)); 13878 if (!K0) 13879 return SDValue(); 13880 13881 // Ordered >= (although NaN inputs should have folded away by now). 13882 if (K0->getValueAPF() > K1->getValueAPF()) 13883 return SDValue(); 13884 13885 // med3 with a nan input acts like 13886 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32) 13887 // 13888 // So the result depends on whether the IEEE mode bit is enabled or not with a 13889 // signaling nan input. 
13890 // ieee=1 13891 // s0 snan: yields s2 13892 // s1 snan: yields s2 13893 // s2 snan: qnan 13894 13895 // s0 qnan: min(s1, s2) 13896 // s1 qnan: min(s0, s2) 13897 // s2 qnan: min(s0, s1) 13898 13899 // ieee=0 13900 // s0 snan: min(s1, s2) 13901 // s1 snan: min(s0, s2) 13902 // s2 snan: qnan 13903 13904 // s0 qnan: min(s1, s2) 13905 // s1 qnan: min(s0, s2) 13906 // s2 qnan: min(s0, s1) 13907 const MachineFunction &MF = DAG.getMachineFunction(); 13908 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 13909 13910 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of 13911 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We 13912 // can only form if op0 is fmaxnum_ieee if IEEE=1. 13913 EVT VT = Op0.getValueType(); 13914 if (Info->getMode().DX10Clamp) { 13915 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the 13916 // hardware fmed3 behavior converting to a min. 13917 // FIXME: Should this be allowing -0.0? 13918 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) 13919 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); 13920 } 13921 13922 // med3 for f16 is only available on gfx9+, and not available for v2f16. 13923 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { 13924 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a 13925 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would 13926 // then give the other result, which is different from med3 with a NaN 13927 // input. 13928 SDValue Var = Op0.getOperand(0); 13929 if (!DAG.isKnownNeverSNaN(Var)) 13930 return SDValue(); 13931 13932 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 13933 13934 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) && 13935 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) { 13936 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var, 13937 SDValue(K0, 0), SDValue(K1, 0)); 13938 } 13939 } 13940 13941 return SDValue(); 13942 } 13943 13944 /// \return true if the subtarget supports minimum3 and maximum3 with the given 13945 /// base min/max opcode \p Opc for type \p VT. 
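/// For example, f32 FMINNUM/FMAXNUM always have a 3-operand form, f16 only
/// qualifies when the subtarget has 16-bit min3/max3, and FMINIMUM/FMAXIMUM
/// use their own per-type subtarget checks.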
13946 static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, 13947 EVT VT) { 13948 switch (Opc) { 13949 case ISD::FMINNUM: 13950 case ISD::FMAXNUM: 13951 case ISD::FMINNUM_IEEE: 13952 case ISD::FMAXNUM_IEEE: 13953 case ISD::FMINIMUMNUM: 13954 case ISD::FMAXIMUMNUM: 13955 case AMDGPUISD::FMIN_LEGACY: 13956 case AMDGPUISD::FMAX_LEGACY: 13957 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); 13958 case ISD::FMINIMUM: 13959 case ISD::FMAXIMUM: 13960 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) || 13961 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) || 13962 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16()); 13963 case ISD::SMAX: 13964 case ISD::SMIN: 13965 case ISD::UMAX: 13966 case ISD::UMIN: 13967 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16()); 13968 default: 13969 return false; 13970 } 13971 13972 llvm_unreachable("not a min/max opcode"); 13973 } 13974 13975 SDValue SITargetLowering::performMinMaxCombine(SDNode *N, 13976 DAGCombinerInfo &DCI) const { 13977 SelectionDAG &DAG = DCI.DAG; 13978 13979 EVT VT = N->getValueType(0); 13980 unsigned Opc = N->getOpcode(); 13981 SDValue Op0 = N->getOperand(0); 13982 SDValue Op1 = N->getOperand(1); 13983 13984 // Only do this if the inner op has one use since this will just increases 13985 // register pressure for no benefit. 13986 13987 if (supportsMin3Max3(*Subtarget, Opc, VT)) { 13988 // max(max(a, b), c) -> max3(a, b, c) 13989 // min(min(a, b), c) -> min3(a, b, c) 13990 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 13991 SDLoc DL(N); 13992 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0), 13993 Op0.getOperand(0), Op0.getOperand(1), Op1); 13994 } 13995 13996 // Try commuted. 13997 // max(a, max(b, c)) -> max3(a, b, c) 13998 // min(a, min(b, c)) -> min3(a, b, c) 13999 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 14000 SDLoc DL(N); 14001 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0), 14002 Op0, Op1.getOperand(0), Op1.getOperand(1)); 14003 } 14004 } 14005 14006 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) 14007 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) 14008 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { 14009 if (SDValue Med3 = performIntMed3ImmCombine( 14010 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true)) 14011 return Med3; 14012 } 14013 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) { 14014 if (SDValue Med3 = performIntMed3ImmCombine( 14015 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true)) 14016 return Med3; 14017 } 14018 14019 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { 14020 if (SDValue Med3 = performIntMed3ImmCombine( 14021 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false)) 14022 return Med3; 14023 } 14024 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) { 14025 if (SDValue Med3 = performIntMed3ImmCombine( 14026 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false)) 14027 return Med3; 14028 } 14029 14030 // if !is_snan(x): 14031 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) 14032 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) 14033 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) 14034 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) 14035 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || 14036 (Opc == 
ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || 14037 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) || 14038 (Opc == AMDGPUISD::FMIN_LEGACY && 14039 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && 14040 (VT == MVT::f32 || VT == MVT::f64 || 14041 (VT == MVT::f16 && Subtarget->has16BitInsts()) || 14042 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && 14043 Op0.hasOneUse()) { 14044 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) 14045 return Res; 14046 } 14047 14048 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal 14049 // for some types, but at a higher cost since it's implemented with a 3 14050 // operand form. 14051 const SDNodeFlags Flags = N->getFlags(); 14052 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && 14053 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) { 14054 unsigned NewOpc = 14055 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; 14056 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags); 14057 } 14058 14059 return SDValue(); 14060 } 14061 14062 static bool isClampZeroToOne(SDValue A, SDValue B) { 14063 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) { 14064 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) { 14065 // FIXME: Should this be allowing -0.0? 14066 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || 14067 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); 14068 } 14069 } 14070 14071 return false; 14072 } 14073 14074 // FIXME: Should only worry about snans for version with chain. 14075 SDValue SITargetLowering::performFMed3Combine(SDNode *N, 14076 DAGCombinerInfo &DCI) const { 14077 EVT VT = N->getValueType(0); 14078 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and 14079 // NaNs. With a NaN input, the order of the operands may change the result. 14080 14081 SelectionDAG &DAG = DCI.DAG; 14082 SDLoc SL(N); 14083 14084 SDValue Src0 = N->getOperand(0); 14085 SDValue Src1 = N->getOperand(1); 14086 SDValue Src2 = N->getOperand(2); 14087 14088 if (isClampZeroToOne(Src0, Src1)) { 14089 // const_a, const_b, x -> clamp is safe in all cases including signaling 14090 // nans. 14091 // FIXME: Should this be allowing -0.0? 14092 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); 14093 } 14094 14095 const MachineFunction &MF = DAG.getMachineFunction(); 14096 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 14097 14098 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother 14099 // handling no dx10-clamp? 14100 if (Info->getMode().DX10Clamp) { 14101 // If NaNs is clamped to 0, we are free to reorder the inputs. 
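// The swaps below move constant operands out of the first source so the
// clamp pattern is matched regardless of the original operand order, e.g.
// fmed3(0.0, x, 1.0) is reordered to fmed3(x, 0.0, 1.0) and then folded to
// clamp(x).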
14102 14103 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) 14104 std::swap(Src0, Src1); 14105 14106 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2)) 14107 std::swap(Src1, Src2); 14108 14109 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) 14110 std::swap(Src0, Src1); 14111 14112 if (isClampZeroToOne(Src1, Src2)) 14113 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); 14114 } 14115 14116 return SDValue(); 14117 } 14118 14119 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, 14120 DAGCombinerInfo &DCI) const { 14121 SDValue Src0 = N->getOperand(0); 14122 SDValue Src1 = N->getOperand(1); 14123 if (Src0.isUndef() && Src1.isUndef()) 14124 return DCI.DAG.getUNDEF(N->getValueType(0)); 14125 return SDValue(); 14126 } 14127 14128 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be 14129 // expanded into a set of cmp/select instructions. 14130 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, 14131 unsigned NumElem, 14132 bool IsDivergentIdx, 14133 const GCNSubtarget *Subtarget) { 14134 if (UseDivergentRegisterIndexing) 14135 return false; 14136 14137 unsigned VecSize = EltSize * NumElem; 14138 14139 // Sub-dword vectors of size 2 dword or less have better implementation. 14140 if (VecSize <= 64 && EltSize < 32) 14141 return false; 14142 14143 // Always expand the rest of sub-dword instructions, otherwise it will be 14144 // lowered via memory. 14145 if (EltSize < 32) 14146 return true; 14147 14148 // Always do this if var-idx is divergent, otherwise it will become a loop. 14149 if (IsDivergentIdx) 14150 return true; 14151 14152 // Large vectors would yield too many compares and v_cndmask_b32 instructions. 14153 unsigned NumInsts = NumElem /* Number of compares */ + 14154 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; 14155 14156 // On some architectures (GFX9) movrel is not available and it's better 14157 // to expand. 14158 if (Subtarget->useVGPRIndexMode()) 14159 return NumInsts <= 16; 14160 14161 // If movrel is available, use it instead of expanding for vector of 8 14162 // elements. 
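// For example, an <8 x i32> access needs 8 compares + 8 cndmasks = 16
// instructions, which is over the limit of 15 below, so it is left to
// movrel rather than expanded.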
14163 if (Subtarget->hasMovrel()) 14164 return NumInsts <= 15; 14165 14166 return true; 14167 } 14168 14169 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { 14170 SDValue Idx = N->getOperand(N->getNumOperands() - 1); 14171 if (isa<ConstantSDNode>(Idx)) 14172 return false; 14173 14174 SDValue Vec = N->getOperand(0); 14175 EVT VecVT = Vec.getValueType(); 14176 EVT EltVT = VecVT.getVectorElementType(); 14177 unsigned EltSize = EltVT.getSizeInBits(); 14178 unsigned NumElem = VecVT.getVectorNumElements(); 14179 14180 return SITargetLowering::shouldExpandVectorDynExt( 14181 EltSize, NumElem, Idx->isDivergent(), getSubtarget()); 14182 } 14183 14184 SDValue 14185 SITargetLowering::performExtractVectorEltCombine(SDNode *N, 14186 DAGCombinerInfo &DCI) const { 14187 SDValue Vec = N->getOperand(0); 14188 SelectionDAG &DAG = DCI.DAG; 14189 14190 EVT VecVT = Vec.getValueType(); 14191 EVT VecEltVT = VecVT.getVectorElementType(); 14192 EVT ResVT = N->getValueType(0); 14193 14194 unsigned VecSize = VecVT.getSizeInBits(); 14195 unsigned VecEltSize = VecEltVT.getSizeInBits(); 14196 14197 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && 14198 allUsesHaveSourceMods(N)) { 14199 SDLoc SL(N); 14200 SDValue Idx = N->getOperand(1); 14201 SDValue Elt = 14202 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx); 14203 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt); 14204 } 14205 14206 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) 14207 // => 14208 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) 14209 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) 14210 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt 14211 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { 14212 SDLoc SL(N); 14213 SDValue Idx = N->getOperand(1); 14214 unsigned Opc = Vec.getOpcode(); 14215 14216 switch (Opc) { 14217 default: 14218 break; 14219 // TODO: Support other binary operations. 14220 case ISD::FADD: 14221 case ISD::FSUB: 14222 case ISD::FMUL: 14223 case ISD::ADD: 14224 case ISD::UMIN: 14225 case ISD::UMAX: 14226 case ISD::SMIN: 14227 case ISD::SMAX: 14228 case ISD::FMAXNUM: 14229 case ISD::FMINNUM: 14230 case ISD::FMAXNUM_IEEE: 14231 case ISD::FMINNUM_IEEE: 14232 case ISD::FMAXIMUM: 14233 case ISD::FMINIMUM: { 14234 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, 14235 Vec.getOperand(0), Idx); 14236 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, 14237 Vec.getOperand(1), Idx); 14238 14239 DCI.AddToWorklist(Elt0.getNode()); 14240 DCI.AddToWorklist(Elt1.getNode()); 14241 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); 14242 } 14243 } 14244 } 14245 14246 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) 14247 if (shouldExpandVectorDynExt(N)) { 14248 SDLoc SL(N); 14249 SDValue Idx = N->getOperand(1); 14250 SDValue V; 14251 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { 14252 SDValue IC = DAG.getVectorIdxConstant(I, SL); 14253 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC); 14254 if (I == 0) 14255 V = Elt; 14256 else 14257 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); 14258 } 14259 return V; 14260 } 14261 14262 if (!DCI.isBeforeLegalize()) 14263 return SDValue(); 14264 14265 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit 14266 // elements. This exposes more load reduction opportunities by replacing 14267 // multiple small extract_vector_elements with a single 32-bit extract. 
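// For example, (extract_vector_elt <8 x i8>:v, 5) becomes a bitcast of v to
// v2i32, an extract of dword 1, a right shift by 8 (byte 5 lives at bit
// offset 40 = 1 * 32 + 8), and a truncate back to the original element type.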
14268 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14269 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && 14270 VecSize > 32 && VecSize % 32 == 0 && Idx) { 14271 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); 14272 14273 unsigned BitIndex = Idx->getZExtValue() * VecEltSize; 14274 unsigned EltIdx = BitIndex / 32; 14275 unsigned LeftoverBitIdx = BitIndex % 32; 14276 SDLoc SL(N); 14277 14278 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); 14279 DCI.AddToWorklist(Cast.getNode()); 14280 14281 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, 14282 DAG.getConstant(EltIdx, SL, MVT::i32)); 14283 DCI.AddToWorklist(Elt.getNode()); 14284 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, 14285 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); 14286 DCI.AddToWorklist(Srl.getNode()); 14287 14288 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); 14289 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl); 14290 DCI.AddToWorklist(Trunc.getNode()); 14291 14292 if (VecEltVT == ResVT) { 14293 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc); 14294 } 14295 14296 assert(ResVT.isScalarInteger()); 14297 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT); 14298 } 14299 14300 return SDValue(); 14301 } 14302 14303 SDValue 14304 SITargetLowering::performInsertVectorEltCombine(SDNode *N, 14305 DAGCombinerInfo &DCI) const { 14306 SDValue Vec = N->getOperand(0); 14307 SDValue Idx = N->getOperand(2); 14308 EVT VecVT = Vec.getValueType(); 14309 EVT EltVT = VecVT.getVectorElementType(); 14310 14311 // INSERT_VECTOR_ELT (<n x e>, var-idx) 14312 // => BUILD_VECTOR n x select (e, const-idx) 14313 if (!shouldExpandVectorDynExt(N)) 14314 return SDValue(); 14315 14316 SelectionDAG &DAG = DCI.DAG; 14317 SDLoc SL(N); 14318 SDValue Ins = N->getOperand(1); 14319 EVT IdxVT = Idx.getValueType(); 14320 14321 SmallVector<SDValue, 16> Ops; 14322 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { 14323 SDValue IC = DAG.getConstant(I, SL, IdxVT); 14324 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); 14325 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ); 14326 Ops.push_back(V); 14327 } 14328 14329 return DAG.getBuildVector(VecVT, SL, Ops); 14330 } 14331 14332 /// Return the source of an fp_extend from f16 to f32, or a converted FP 14333 /// constant. 
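/// For example, (fp_extend f16 %a) yields %a, and an f32 constant such as
/// 2.0 that converts to f16 without losing information yields the converted
/// f16 constant; anything else yields an empty SDValue.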
14334 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) { 14335 if (Src.getOpcode() == ISD::FP_EXTEND && 14336 Src.getOperand(0).getValueType() == MVT::f16) { 14337 return Src.getOperand(0); 14338 } 14339 14340 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) { 14341 APFloat Val = CFP->getValueAPF(); 14342 bool LosesInfo = true; 14343 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); 14344 if (!LosesInfo) 14345 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16); 14346 } 14347 14348 return SDValue(); 14349 } 14350 14351 SDValue SITargetLowering::performFPRoundCombine(SDNode *N, 14352 DAGCombinerInfo &DCI) const { 14353 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && 14354 "combine only useful on gfx8"); 14355 14356 SDValue TruncSrc = N->getOperand(0); 14357 EVT VT = N->getValueType(0); 14358 if (VT != MVT::f16) 14359 return SDValue(); 14360 14361 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 || 14362 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse()) 14363 return SDValue(); 14364 14365 SelectionDAG &DAG = DCI.DAG; 14366 SDLoc SL(N); 14367 14368 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3, 14369 // and expanding it with min/max saves 1 instruction vs. casting to f32 and 14370 // casting back. 14371 14372 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) => 14373 // fmin(fmax(a, b), fmax(fmin(a, b), c)) 14374 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0)); 14375 if (!A) 14376 return SDValue(); 14377 14378 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1)); 14379 if (!B) 14380 return SDValue(); 14381 14382 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2)); 14383 if (!C) 14384 return SDValue(); 14385 14386 // This changes signaling nan behavior. If an input is a signaling nan, it 14387 // would have been quieted by the fpext originally. We don't care because 14388 // these are unconstrained ops. If we needed to insert quieting canonicalizes 14389 // we would be worse off than just doing the promotion. 14390 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B); 14391 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B); 14392 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C); 14393 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1); 14394 } 14395 14396 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, 14397 const SDNode *N0, 14398 const SDNode *N1) const { 14399 EVT VT = N0->getValueType(0); 14400 14401 // Only do this if we are not trying to support denormals. v_mad_f32 does not 14402 // support denormals ever. 
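// For example, an f32 pair compiled with f32 denormals flushed can fuse to
// FMAD; otherwise FMA is returned when contraction is allowed and FMA is
// fast on this subtarget, and 0 means no fused opcode is available.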
14403 if (((VT == MVT::f32 && 14404 denormalModeIsFlushAllF32(DAG.getMachineFunction())) || 14405 (VT == MVT::f16 && Subtarget->hasMadF16() && 14406 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) && 14407 isOperationLegal(ISD::FMAD, VT)) 14408 return ISD::FMAD; 14409 14410 const TargetOptions &Options = DAG.getTarget().Options; 14411 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || 14412 (N0->getFlags().hasAllowContract() && 14413 N1->getFlags().hasAllowContract())) && 14414 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { 14415 return ISD::FMA; 14416 } 14417 14418 return 0; 14419 } 14420 14421 // For a reassociatable opcode perform: 14422 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform 14423 SDValue SITargetLowering::reassociateScalarOps(SDNode *N, 14424 SelectionDAG &DAG) const { 14425 EVT VT = N->getValueType(0); 14426 if (VT != MVT::i32 && VT != MVT::i64) 14427 return SDValue(); 14428 14429 if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) 14430 return SDValue(); 14431 14432 unsigned Opc = N->getOpcode(); 14433 SDValue Op0 = N->getOperand(0); 14434 SDValue Op1 = N->getOperand(1); 14435 14436 if (!(Op0->isDivergent() ^ Op1->isDivergent())) 14437 return SDValue(); 14438 14439 if (Op0->isDivergent()) 14440 std::swap(Op0, Op1); 14441 14442 if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) 14443 return SDValue(); 14444 14445 SDValue Op2 = Op1.getOperand(1); 14446 Op1 = Op1.getOperand(0); 14447 if (!(Op1->isDivergent() ^ Op2->isDivergent())) 14448 return SDValue(); 14449 14450 if (Op1->isDivergent()) 14451 std::swap(Op1, Op2); 14452 14453 SDLoc SL(N); 14454 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); 14455 return DAG.getNode(Opc, SL, VT, Add1, Op2); 14456 } 14457 14458 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, 14459 SDValue N0, SDValue N1, SDValue N2, bool Signed) { 14460 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; 14461 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1); 14462 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2); 14463 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); 14464 } 14465 14466 // Fold 14467 // y = lshr i64 x, 32 14468 // res = add (mul i64 y, Const), x where "Const" is a 64-bit constant 14469 // with Const.hi == -1 14470 // To 14471 // res = mad_u64_u32 y.lo ,Const.lo, x.lo 14472 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, 14473 SDValue MulLHS, SDValue MulRHS, 14474 SDValue AddRHS) { 14475 if (MulRHS.getOpcode() == ISD::SRL) 14476 std::swap(MulLHS, MulRHS); 14477 14478 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL) 14479 return SDValue(); 14480 14481 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1)); 14482 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 || 14483 MulLHS.getOperand(0) != AddRHS) 14484 return SDValue(); 14485 14486 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode()); 14487 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1)) 14488 return SDValue(); 14489 14490 SDValue ConstMul = 14491 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32); 14492 return getMad64_32(DAG, SL, MVT::i64, 14493 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul, 14494 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false); 14495 } 14496 14497 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high 14498 // multiplies, if any. 
14499 // 14500 // Full 64-bit multiplies that feed into an addition are lowered here instead 14501 // of using the generic expansion. The generic expansion ends up with 14502 // a tree of ADD nodes that prevents us from using the "add" part of the 14503 // MAD instruction. The expansion produced here results in a chain of ADDs 14504 // instead of a tree. 14505 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, 14506 DAGCombinerInfo &DCI) const { 14507 assert(N->getOpcode() == ISD::ADD); 14508 14509 SelectionDAG &DAG = DCI.DAG; 14510 EVT VT = N->getValueType(0); 14511 SDLoc SL(N); 14512 SDValue LHS = N->getOperand(0); 14513 SDValue RHS = N->getOperand(1); 14514 14515 if (VT.isVector()) 14516 return SDValue(); 14517 14518 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall 14519 // result in scalar registers for uniform values. 14520 if (!N->isDivergent() && Subtarget->hasSMulHi()) 14521 return SDValue(); 14522 14523 unsigned NumBits = VT.getScalarSizeInBits(); 14524 if (NumBits <= 32 || NumBits > 64) 14525 return SDValue(); 14526 14527 if (LHS.getOpcode() != ISD::MUL) { 14528 assert(RHS.getOpcode() == ISD::MUL); 14529 std::swap(LHS, RHS); 14530 } 14531 14532 // Avoid the fold if it would unduly increase the number of multiplies due to 14533 // multiple uses, except on hardware with full-rate multiply-add (which is 14534 // part of full-rate 64-bit ops). 14535 if (!Subtarget->hasFullRate64Ops()) { 14536 unsigned NumUsers = 0; 14537 for (SDNode *User : LHS->users()) { 14538 // There is a use that does not feed into addition, so the multiply can't 14539 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. 14540 if (User->getOpcode() != ISD::ADD) 14541 return SDValue(); 14542 14543 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer 14544 // MUL + 3xADD + 3xADDC over 3xMAD. 14545 ++NumUsers; 14546 if (NumUsers >= 3) 14547 return SDValue(); 14548 } 14549 } 14550 14551 SDValue MulLHS = LHS.getOperand(0); 14552 SDValue MulRHS = LHS.getOperand(1); 14553 SDValue AddRHS = RHS; 14554 14555 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS)) 14556 return FoldedMAD; 14557 14558 // Always check whether operands are small unsigned values, since that 14559 // knowledge is useful in more cases. Check for small signed values only if 14560 // doing so can unlock a shorter code sequence. 14561 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32; 14562 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32; 14563 14564 bool MulSignedLo = false; 14565 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) { 14566 MulSignedLo = 14567 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32; 14568 } 14569 14570 // The operands and final result all have the same number of bits. If 14571 // operands need to be extended, they can be extended with garbage. The 14572 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is 14573 // truncated away in the end. 14574 if (VT != MVT::i64) { 14575 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS); 14576 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS); 14577 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS); 14578 } 14579 14580 // The basic code generated is conceptually straightforward. 
Pseudo code: 14581 // 14582 // accum = mad_64_32 lhs.lo, rhs.lo, accum 14583 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi 14584 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi 14585 // 14586 // The second and third lines are optional, depending on whether the factors 14587 // are {sign,zero}-extended or not. 14588 // 14589 // The actual DAG is noisier than the pseudo code, but only due to 14590 // instructions that disassemble values into low and high parts, and 14591 // assemble the final result. 14592 SDValue One = DAG.getConstant(1, SL, MVT::i32); 14593 14594 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS); 14595 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS); 14596 SDValue Accum = 14597 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo); 14598 14599 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) { 14600 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32); 14601 14602 if (!MulLHSUnsigned32) { 14603 auto MulLHSHi = 14604 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One); 14605 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo); 14606 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); 14607 } 14608 14609 if (!MulRHSUnsigned32) { 14610 auto MulRHSHi = 14611 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One); 14612 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi); 14613 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); 14614 } 14615 14616 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi}); 14617 Accum = DAG.getBitcast(MVT::i64, Accum); 14618 } 14619 14620 if (VT != MVT::i64) 14621 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum); 14622 return Accum; 14623 } 14624 14625 SDValue 14626 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, 14627 DAGCombinerInfo &DCI) const { 14628 SDValue RHS = N->getOperand(1); 14629 auto *CRHS = dyn_cast<ConstantSDNode>(RHS); 14630 if (!CRHS) 14631 return SDValue(); 14632 14633 // TODO: Worth using computeKnownBits? Maybe expensive since it's so 14634 // common. 14635 uint64_t Val = CRHS->getZExtValue(); 14636 if (countr_zero(Val) >= 32) { 14637 SelectionDAG &DAG = DCI.DAG; 14638 SDLoc SL(N); 14639 SDValue LHS = N->getOperand(0); 14640 14641 // Avoid carry machinery if we know the low half of the add does not 14642 // contribute to the final result. 14643 // 14644 // add i64:x, K if computeTrailingZeros(K) >= 32 14645 // => build_pair (add x.hi, K.hi), x.lo 14646 14647 // Breaking the 64-bit add here with this strange constant is unlikely 14648 // to interfere with addressing mode patterns. 14649 14650 SDValue Hi = getHiHalf64(LHS, DAG); 14651 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); 14652 SDValue AddHi = 14653 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); 14654 14655 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); 14656 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); 14657 } 14658 14659 return SDValue(); 14660 } 14661 14662 // Collect the ultimate src of each of the mul node's operands, and confirm 14663 // each operand is 8 bytes. 
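// That is, calculateByteProvider must show that only the low byte of the
// operand can be non-zero (byte 0 has a real provider and byte 1, if
// present, is a known zero); wider operands cannot feed a dot4 lane.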
14664 static std::optional<ByteProvider<SDValue>> 14665 handleMulOperand(const SDValue &MulOperand) { 14666 auto Byte0 = calculateByteProvider(MulOperand, 0, 0); 14667 if (!Byte0 || Byte0->isConstantZero()) { 14668 return std::nullopt; 14669 } 14670 auto Byte1 = calculateByteProvider(MulOperand, 1, 0); 14671 if (Byte1 && !Byte1->isConstantZero()) { 14672 return std::nullopt; 14673 } 14674 return Byte0; 14675 } 14676 14677 static unsigned addPermMasks(unsigned First, unsigned Second) { 14678 unsigned FirstCs = First & 0x0c0c0c0c; 14679 unsigned SecondCs = Second & 0x0c0c0c0c; 14680 unsigned FirstNoCs = First & ~0x0c0c0c0c; 14681 unsigned SecondNoCs = Second & ~0x0c0c0c0c; 14682 14683 assert((FirstCs & 0xFF) | (SecondCs & 0xFF)); 14684 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00)); 14685 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000)); 14686 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000)); 14687 14688 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); 14689 } 14690 14691 struct DotSrc { 14692 SDValue SrcOp; 14693 int64_t PermMask; 14694 int64_t DWordOffset; 14695 }; 14696 14697 static void placeSources(ByteProvider<SDValue> &Src0, 14698 ByteProvider<SDValue> &Src1, 14699 SmallVectorImpl<DotSrc> &Src0s, 14700 SmallVectorImpl<DotSrc> &Src1s, int Step) { 14701 14702 assert(Src0.Src.has_value() && Src1.Src.has_value()); 14703 // Src0s and Src1s are empty, just place arbitrarily. 14704 if (Step == 0) { 14705 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c, 14706 Src0.SrcOffset / 4}); 14707 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c, 14708 Src1.SrcOffset / 4}); 14709 return; 14710 } 14711 14712 for (int BPI = 0; BPI < 2; BPI++) { 14713 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1}; 14714 if (BPI == 1) { 14715 BPP = {Src1, Src0}; 14716 } 14717 unsigned ZeroMask = 0x0c0c0c0c; 14718 unsigned FMask = 0xFF << (8 * (3 - Step)); 14719 14720 unsigned FirstMask = 14721 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); 14722 unsigned SecondMask = 14723 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); 14724 // Attempt to find Src vector which contains our SDValue, if so, add our 14725 // perm mask to the existing one. If we are unable to find a match for the 14726 // first SDValue, attempt to find match for the second. 14727 int FirstGroup = -1; 14728 for (int I = 0; I < 2; I++) { 14729 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s; 14730 auto MatchesFirst = [&BPP](DotSrc &IterElt) { 14731 return IterElt.SrcOp == *BPP.first.Src && 14732 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4)); 14733 }; 14734 14735 auto *Match = llvm::find_if(Srcs, MatchesFirst); 14736 if (Match != Srcs.end()) { 14737 Match->PermMask = addPermMasks(FirstMask, Match->PermMask); 14738 FirstGroup = I; 14739 break; 14740 } 14741 } 14742 if (FirstGroup != -1) { 14743 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? 
Src0s : Src1s; 14744 auto MatchesSecond = [&BPP](DotSrc &IterElt) { 14745 return IterElt.SrcOp == *BPP.second.Src && 14746 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4)); 14747 }; 14748 auto *Match = llvm::find_if(Srcs, MatchesSecond); 14749 if (Match != Srcs.end()) { 14750 Match->PermMask = addPermMasks(SecondMask, Match->PermMask); 14751 } else 14752 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4}); 14753 return; 14754 } 14755 } 14756 14757 // If we have made it here, then we could not find a match in Src0s or Src1s 14758 // for either Src0 or Src1, so just place them arbitrarily. 14759 14760 unsigned ZeroMask = 0x0c0c0c0c; 14761 unsigned FMask = 0xFF << (8 * (3 - Step)); 14762 14763 Src0s.push_back( 14764 {*Src0.Src, 14765 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), 14766 Src0.SrcOffset / 4}); 14767 Src1s.push_back( 14768 {*Src1.Src, 14769 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), 14770 Src1.SrcOffset / 4}); 14771 } 14772 14773 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, 14774 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned, 14775 bool IsAny) { 14776 14777 // If we just have one source, just permute it accordingly. 14778 if (Srcs.size() == 1) { 14779 auto *Elt = Srcs.begin(); 14780 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset); 14781 14782 // v_perm will produce the original value 14783 if (Elt->PermMask == 0x3020100) 14784 return EltOp; 14785 14786 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp, 14787 DAG.getConstant(Elt->PermMask, SL, MVT::i32)); 14788 } 14789 14790 auto *FirstElt = Srcs.begin(); 14791 auto *SecondElt = std::next(FirstElt); 14792 14793 SmallVector<SDValue, 2> Perms; 14794 14795 // If we have multiple sources in the chain, combine them via perms (using 14796 // calculated perm mask) and Ors. 14797 while (true) { 14798 auto FirstMask = FirstElt->PermMask; 14799 auto SecondMask = SecondElt->PermMask; 14800 14801 unsigned FirstCs = FirstMask & 0x0c0c0c0c; 14802 unsigned FirstPlusFour = FirstMask | 0x04040404; 14803 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any 14804 // original 0x0C. 14805 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs; 14806 14807 auto PermMask = addPermMasks(FirstMask, SecondMask); 14808 auto FirstVal = 14809 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); 14810 auto SecondVal = 14811 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset); 14812 14813 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal, 14814 SecondVal, 14815 DAG.getConstant(PermMask, SL, MVT::i32))); 14816 14817 FirstElt = std::next(SecondElt); 14818 if (FirstElt == Srcs.end()) 14819 break; 14820 14821 SecondElt = std::next(FirstElt); 14822 // If we only have a FirstElt, then just combine that into the cumulative 14823 // source node. 14824 if (SecondElt == Srcs.end()) { 14825 auto EltOp = 14826 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); 14827 14828 Perms.push_back( 14829 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp, 14830 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32))); 14831 break; 14832 } 14833 } 14834 14835 assert(Perms.size() == 1 || Perms.size() == 2); 14836 return Perms.size() == 2 14837 ? 
DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1]) 14838 : Perms[0]; 14839 } 14840 14841 static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) { 14842 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) { 14843 EntryMask = EntryMask >> ((4 - ChainLength) * 8); 14844 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000; 14845 EntryMask += ZeroMask; 14846 } 14847 } 14848 14849 static bool isMul(const SDValue Op) { 14850 auto Opcode = Op.getOpcode(); 14851 14852 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 || 14853 Opcode == AMDGPUISD::MUL_I24); 14854 } 14855 14856 static std::optional<bool> 14857 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0, 14858 ByteProvider<SDValue> &Src1, const SDValue &S0Op, 14859 const SDValue &S1Op, const SelectionDAG &DAG) { 14860 // If we both ops are i8s (pre legalize-dag), then the signedness semantics 14861 // of the dot4 is irrelevant. 14862 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8) 14863 return false; 14864 14865 auto Known0 = DAG.computeKnownBits(S0Op, 0); 14866 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0; 14867 bool S0IsSigned = Known0.countMinLeadingOnes() > 0; 14868 auto Known1 = DAG.computeKnownBits(S1Op, 0); 14869 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0; 14870 bool S1IsSigned = Known1.countMinLeadingOnes() > 0; 14871 14872 assert(!(S0IsUnsigned && S0IsSigned)); 14873 assert(!(S1IsUnsigned && S1IsSigned)); 14874 14875 // There are 9 possible permutations of 14876 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned} 14877 14878 // In two permutations, the sign bits are known to be the same for both Ops, 14879 // so simply return Signed / Unsigned corresponding to the MSB 14880 14881 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned)) 14882 return S0IsSigned; 14883 14884 // In another two permutations, the sign bits are known to be opposite. In 14885 // this case return std::nullopt to indicate a bad match. 14886 14887 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned)) 14888 return std::nullopt; 14889 14890 // In the remaining five permutations, we don't know the value of the sign 14891 // bit for at least one Op. Since we have a valid ByteProvider, we know that 14892 // the upper bits must be extension bits. Thus, the only ways for the sign 14893 // bit to be unknown is if it was sign extended from unknown value, or if it 14894 // was any extended. In either case, it is correct to use the signed 14895 // version of the signedness semantics of dot4 14896 14897 // In two of such permutations, we known the sign bit is set for 14898 // one op, and the other is unknown. It is okay to used signed version of 14899 // dot4. 14900 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) || 14901 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned)))) 14902 return true; 14903 14904 // In one such permutation, we don't know either of the sign bits. It is okay 14905 // to used the signed version of dot4. 14906 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned))) 14907 return true; 14908 14909 // In two of such permutations, we known the sign bit is unset for 14910 // one op, and the other is unknown. Return std::nullopt to indicate a 14911 // bad match. 
14912 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) || 14913 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))) 14914 return std::nullopt; 14915 14916 llvm_unreachable("Fully covered condition"); 14917 } 14918 14919 SDValue SITargetLowering::performAddCombine(SDNode *N, 14920 DAGCombinerInfo &DCI) const { 14921 SelectionDAG &DAG = DCI.DAG; 14922 EVT VT = N->getValueType(0); 14923 SDLoc SL(N); 14924 SDValue LHS = N->getOperand(0); 14925 SDValue RHS = N->getOperand(1); 14926 14927 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) { 14928 if (Subtarget->hasMad64_32()) { 14929 if (SDValue Folded = tryFoldToMad64_32(N, DCI)) 14930 return Folded; 14931 } 14932 } 14933 14934 if (SDValue V = reassociateScalarOps(N, DAG)) { 14935 return V; 14936 } 14937 14938 if (VT == MVT::i64) { 14939 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) 14940 return Folded; 14941 } 14942 14943 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && 14944 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { 14945 SDValue TempNode(N, 0); 14946 std::optional<bool> IsSigned; 14947 SmallVector<DotSrc, 4> Src0s; 14948 SmallVector<DotSrc, 4> Src1s; 14949 SmallVector<SDValue, 4> Src2s; 14950 14951 // Match the v_dot4 tree, while collecting src nodes. 14952 int ChainLength = 0; 14953 for (int I = 0; I < 4; I++) { 14954 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1; 14955 if (MulIdx == -1) 14956 break; 14957 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0)); 14958 if (!Src0) 14959 break; 14960 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1)); 14961 if (!Src1) 14962 break; 14963 14964 auto IterIsSigned = checkDot4MulSignedness( 14965 TempNode->getOperand(MulIdx), *Src0, *Src1, 14966 TempNode->getOperand(MulIdx)->getOperand(0), 14967 TempNode->getOperand(MulIdx)->getOperand(1), DAG); 14968 if (!IterIsSigned) 14969 break; 14970 if (!IsSigned) 14971 IsSigned = *IterIsSigned; 14972 if (*IterIsSigned != *IsSigned) 14973 break; 14974 placeSources(*Src0, *Src1, Src0s, Src1s, I); 14975 auto AddIdx = 1 - MulIdx; 14976 // Allow the special case where add (add (mul24, 0), mul24) became -> 14977 // add (mul24, mul24). 14978 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) { 14979 Src2s.push_back(TempNode->getOperand(AddIdx)); 14980 auto Src0 = 14981 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); 14982 if (!Src0) 14983 break; 14984 auto Src1 = 14985 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); 14986 if (!Src1) 14987 break; 14988 auto IterIsSigned = checkDot4MulSignedness( 14989 TempNode->getOperand(AddIdx), *Src0, *Src1, 14990 TempNode->getOperand(AddIdx)->getOperand(0), 14991 TempNode->getOperand(AddIdx)->getOperand(1), DAG); 14992 if (!IterIsSigned) 14993 break; 14994 assert(IsSigned); 14995 if (*IterIsSigned != *IsSigned) 14996 break; 14997 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1); 14998 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32)); 14999 ChainLength = I + 2; 15000 break; 15001 } 15002 15003 TempNode = TempNode->getOperand(AddIdx); 15004 Src2s.push_back(TempNode); 15005 ChainLength = I + 1; 15006 if (TempNode->getNumOperands() < 2) 15007 break; 15008 LHS = TempNode->getOperand(0); 15009 RHS = TempNode->getOperand(1); 15010 } 15011 15012 if (ChainLength < 2) 15013 return SDValue(); 15014 15015 // Masks were constructed with assumption that we would find a chain of 15016 // length 4. 
If not, then we need to 0 out the MSB bits (via perm mask of 15017 // 0x0c) so they do not affect dot calculation. 15018 if (ChainLength < 4) { 15019 fixMasks(Src0s, ChainLength); 15020 fixMasks(Src1s, ChainLength); 15021 } 15022 15023 SDValue Src0, Src1; 15024 15025 // If we are just using a single source for both, and have permuted the 15026 // bytes consistently, we can just use the sources without permuting 15027 // (commutation). 15028 bool UseOriginalSrc = false; 15029 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && 15030 Src0s.begin()->PermMask == Src1s.begin()->PermMask && 15031 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 && 15032 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) { 15033 SmallVector<unsigned, 4> SrcBytes; 15034 auto Src0Mask = Src0s.begin()->PermMask; 15035 SrcBytes.push_back(Src0Mask & 0xFF000000); 15036 bool UniqueEntries = true; 15037 for (auto I = 1; I < 4; I++) { 15038 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); 15039 15040 if (is_contained(SrcBytes, NextByte)) { 15041 UniqueEntries = false; 15042 break; 15043 } 15044 SrcBytes.push_back(NextByte); 15045 } 15046 15047 if (UniqueEntries) { 15048 UseOriginalSrc = true; 15049 15050 auto *FirstElt = Src0s.begin(); 15051 auto FirstEltOp = 15052 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); 15053 15054 auto *SecondElt = Src1s.begin(); 15055 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp, 15056 SecondElt->DWordOffset); 15057 15058 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL, 15059 MVT::getIntegerVT(32)); 15060 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL, 15061 MVT::getIntegerVT(32)); 15062 } 15063 } 15064 15065 if (!UseOriginalSrc) { 15066 Src0 = resolveSources(DAG, SL, Src0s, false, true); 15067 Src1 = resolveSources(DAG, SL, Src1s, false, true); 15068 } 15069 15070 assert(IsSigned); 15071 SDValue Src2 = 15072 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32); 15073 15074 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4 15075 : Intrinsic::amdgcn_udot4, 15076 SL, MVT::i64); 15077 15078 assert(!VT.isVector()); 15079 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0, 15080 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1)); 15081 15082 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT); 15083 } 15084 15085 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) 15086 return SDValue(); 15087 15088 // add x, zext (setcc) => uaddo_carry x, 0, setcc 15089 // add x, sext (setcc) => usubo_carry x, 0, setcc 15090 unsigned Opc = LHS.getOpcode(); 15091 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || 15092 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY) 15093 std::swap(RHS, LHS); 15094 15095 Opc = RHS.getOpcode(); 15096 switch (Opc) { 15097 default: 15098 break; 15099 case ISD::ZERO_EXTEND: 15100 case ISD::SIGN_EXTEND: 15101 case ISD::ANY_EXTEND: { 15102 auto Cond = RHS.getOperand(0); 15103 // If this won't be a real VOPC output, we would still need to insert an 15104 // extra instruction anyway. 15105 if (!isBoolSGPR(Cond)) 15106 break; 15107 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); 15108 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond}; 15109 Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::USUBO_CARRY : ISD::UADDO_CARRY; 15110 return DAG.getNode(Opc, SL, VTList, Args); 15111 } 15112 case ISD::UADDO_CARRY: { 15113 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc 15114 if (!isNullConstant(RHS.getOperand(1))) 15115 break; 15116 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)}; 15117 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args); 15118 } 15119 } 15120 return SDValue(); 15121 } 15122 15123 SDValue SITargetLowering::performPtrAddCombine(SDNode *N, 15124 DAGCombinerInfo &DCI) const { 15125 SelectionDAG &DAG = DCI.DAG; 15126 SDLoc DL(N); 15127 SDValue N0 = N->getOperand(0); 15128 SDValue N1 = N->getOperand(1); 15129 15130 if (N1.getOpcode() == ISD::ADD) { 15131 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, 15132 // y is not, and (add y, z) is used only once. 15133 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, 15134 // z is not, and (add y, z) is used only once. 15135 // The goal is to move constant offsets to the outermost ptradd, to create 15136 // more opportunities to fold offsets into memory instructions. 15137 // Together with the generic combines in DAGCombiner.cpp, this also 15138 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). 15139 // 15140 // This transform is here instead of in the general DAGCombiner as it can 15141 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for 15142 // AArch64's CPA. 15143 SDValue X = N0; 15144 SDValue Y = N1.getOperand(0); 15145 SDValue Z = N1.getOperand(1); 15146 if (N1.hasOneUse()) { 15147 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); 15148 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); 15149 if (ZIsConstant != YIsConstant) { 15150 // If both additions in the original were NUW, the new ones are as well. 15151 SDNodeFlags Flags = 15152 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; 15153 if (YIsConstant) 15154 std::swap(Y, Z); 15155 15156 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); 15157 DCI.AddToWorklist(Inner.getNode()); 15158 return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); 15159 } 15160 } 15161 } 15162 15163 return SDValue(); 15164 } 15165 15166 SDValue SITargetLowering::performSubCombine(SDNode *N, 15167 DAGCombinerInfo &DCI) const { 15168 SelectionDAG &DAG = DCI.DAG; 15169 EVT VT = N->getValueType(0); 15170 15171 if (VT == MVT::i64) { 15172 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) 15173 return Folded; 15174 } 15175 15176 if (VT != MVT::i32) 15177 return SDValue(); 15178 15179 SDLoc SL(N); 15180 SDValue LHS = N->getOperand(0); 15181 SDValue RHS = N->getOperand(1); 15182 15183 // sub x, zext (setcc) => usubo_carry x, 0, setcc 15184 // sub x, sext (setcc) => uaddo_carry x, 0, setcc 15185 unsigned Opc = RHS.getOpcode(); 15186 switch (Opc) { 15187 default: 15188 break; 15189 case ISD::ZERO_EXTEND: 15190 case ISD::SIGN_EXTEND: 15191 case ISD::ANY_EXTEND: { 15192 auto Cond = RHS.getOperand(0); 15193 // If this won't be a real VOPC output, we would still need to insert an 15194 // extra instruction anyway. 15195 if (!isBoolSGPR(Cond)) 15196 break; 15197 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); 15198 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond}; 15199 Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::UADDO_CARRY : ISD::USUBO_CARRY; 15200 return DAG.getNode(Opc, SL, VTList, Args); 15201 } 15202 } 15203 15204 if (LHS.getOpcode() == ISD::USUBO_CARRY) { 15205 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc 15206 if (!isNullConstant(LHS.getOperand(1))) 15207 return SDValue(); 15208 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)}; 15209 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); 15210 } 15211 return SDValue(); 15212 } 15213 15214 SDValue 15215 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, 15216 DAGCombinerInfo &DCI) const { 15217 15218 if (N->getValueType(0) != MVT::i32) 15219 return SDValue(); 15220 15221 if (!isNullConstant(N->getOperand(1))) 15222 return SDValue(); 15223 15224 SelectionDAG &DAG = DCI.DAG; 15225 SDValue LHS = N->getOperand(0); 15226 15227 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc 15228 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc 15229 unsigned LHSOpc = LHS.getOpcode(); 15230 unsigned Opc = N->getOpcode(); 15231 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) || 15232 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) { 15233 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)}; 15234 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); 15235 } 15236 return SDValue(); 15237 } 15238 15239 SDValue SITargetLowering::performFAddCombine(SDNode *N, 15240 DAGCombinerInfo &DCI) const { 15241 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 15242 return SDValue(); 15243 15244 SelectionDAG &DAG = DCI.DAG; 15245 EVT VT = N->getValueType(0); 15246 15247 SDLoc SL(N); 15248 SDValue LHS = N->getOperand(0); 15249 SDValue RHS = N->getOperand(1); 15250 15251 // These should really be instruction patterns, but writing patterns with 15252 // source modifiers is a pain. 15253 15254 // fadd (fadd (a, a), b) -> mad 2.0, a, b 15255 if (LHS.getOpcode() == ISD::FADD) { 15256 SDValue A = LHS.getOperand(0); 15257 if (A == LHS.getOperand(1)) { 15258 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 15259 if (FusedOp != 0) { 15260 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 15261 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS); 15262 } 15263 } 15264 } 15265 15266 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 15267 if (RHS.getOpcode() == ISD::FADD) { 15268 SDValue A = RHS.getOperand(0); 15269 if (A == RHS.getOperand(1)) { 15270 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 15271 if (FusedOp != 0) { 15272 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 15273 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS); 15274 } 15275 } 15276 } 15277 15278 return SDValue(); 15279 } 15280 15281 SDValue SITargetLowering::performFSubCombine(SDNode *N, 15282 DAGCombinerInfo &DCI) const { 15283 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 15284 return SDValue(); 15285 15286 SelectionDAG &DAG = DCI.DAG; 15287 SDLoc SL(N); 15288 EVT VT = N->getValueType(0); 15289 assert(!VT.isVector()); 15290 15291 // Try to get the fneg to fold into the source modifier. This undoes generic 15292 // DAG combines and folds them into the mad. 15293 // 15294 // Only do this if we are not trying to support denormals. v_mad_f32 does 15295 // not support denormals ever. 
15296 SDValue LHS = N->getOperand(0); 15297 SDValue RHS = N->getOperand(1); 15298 if (LHS.getOpcode() == ISD::FADD) { 15299 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 15300 SDValue A = LHS.getOperand(0); 15301 if (A == LHS.getOperand(1)) { 15302 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 15303 if (FusedOp != 0) { 15304 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 15305 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 15306 15307 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS); 15308 } 15309 } 15310 } 15311 15312 if (RHS.getOpcode() == ISD::FADD) { 15313 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 15314 15315 SDValue A = RHS.getOperand(0); 15316 if (A == RHS.getOperand(1)) { 15317 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 15318 if (FusedOp != 0) { 15319 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); 15320 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS); 15321 } 15322 } 15323 } 15324 15325 return SDValue(); 15326 } 15327 15328 SDValue SITargetLowering::performFDivCombine(SDNode *N, 15329 DAGCombinerInfo &DCI) const { 15330 SelectionDAG &DAG = DCI.DAG; 15331 SDLoc SL(N); 15332 EVT VT = N->getValueType(0); 15333 if (VT != MVT::f16 || !Subtarget->has16BitInsts()) 15334 return SDValue(); 15335 15336 SDValue LHS = N->getOperand(0); 15337 SDValue RHS = N->getOperand(1); 15338 15339 SDNodeFlags Flags = N->getFlags(); 15340 SDNodeFlags RHSFlags = RHS->getFlags(); 15341 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || 15342 !RHS->hasOneUse()) 15343 return SDValue(); 15344 15345 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 15346 bool IsNegative = false; 15347 if (CLHS->isExactlyValue(1.0) || 15348 (IsNegative = CLHS->isExactlyValue(-1.0))) { 15349 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 15350 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 15351 if (RHS.getOpcode() == ISD::FSQRT) { 15352 // TODO: Or in RHS flags, somehow missing from SDNodeFlags 15353 SDValue Rsq = 15354 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags); 15355 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq; 15356 } 15357 } 15358 } 15359 15360 return SDValue(); 15361 } 15362 15363 SDValue SITargetLowering::performFMulCombine(SDNode *N, 15364 DAGCombinerInfo &DCI) const { 15365 SelectionDAG &DAG = DCI.DAG; 15366 EVT VT = N->getValueType(0); 15367 EVT ScalarVT = VT.getScalarType(); 15368 EVT IntVT = VT.changeElementType(MVT::i32); 15369 15370 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() && 15371 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) { 15372 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32. 15373 return SDValue(); 15374 } 15375 15376 SDValue LHS = N->getOperand(0); 15377 SDValue RHS = N->getOperand(1); 15378 15379 // It is cheaper to realize i32 inline constants as compared against 15380 // materializing f16 or f64 (or even non-inline f32) values, 15381 // possible via ldexp usage, as shown below : 15382 // 15383 // Given : A = 2^a & B = 2^b ; where a and b are integers. 
15384 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) ) 15385 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) ) 15386 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) && 15387 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) { 15388 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1)); 15389 if (!TrueNode) 15390 return SDValue(); 15391 const ConstantFPSDNode *FalseNode = 15392 isConstOrConstSplatFP(RHS.getOperand(2)); 15393 if (!FalseNode) 15394 return SDValue(); 15395 15396 if (TrueNode->isNegative() != FalseNode->isNegative()) 15397 return SDValue(); 15398 15399 // For f32, only non-inline constants should be transformed. 15400 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15401 if (ScalarVT == MVT::f32 && 15402 TII->isInlineConstant(TrueNode->getValueAPF()) && 15403 TII->isInlineConstant(FalseNode->getValueAPF())) 15404 return SDValue(); 15405 15406 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs(); 15407 if (TrueNodeExpVal == INT_MIN) 15408 return SDValue(); 15409 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs(); 15410 if (FalseNodeExpVal == INT_MIN) 15411 return SDValue(); 15412 15413 SDLoc SL(N); 15414 SDValue SelectNode = 15415 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0), 15416 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT), 15417 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT)); 15418 15419 LHS = TrueNode->isNegative() 15420 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags()) 15421 : LHS; 15422 15423 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags()); 15424 } 15425 15426 return SDValue(); 15427 } 15428 15429 SDValue SITargetLowering::performFMACombine(SDNode *N, 15430 DAGCombinerInfo &DCI) const { 15431 SelectionDAG &DAG = DCI.DAG; 15432 EVT VT = N->getValueType(0); 15433 SDLoc SL(N); 15434 15435 if (!Subtarget->hasDot10Insts() || VT != MVT::f32) 15436 return SDValue(); 15437 15438 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> 15439 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) 15440 SDValue Op1 = N->getOperand(0); 15441 SDValue Op2 = N->getOperand(1); 15442 SDValue FMA = N->getOperand(2); 15443 15444 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND || 15445 Op2.getOpcode() != ISD::FP_EXTEND) 15446 return SDValue(); 15447 15448 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, 15449 // regardless of the denorm mode setting. Therefore, 15450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. 
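  // The checks below only fire when both multiplies extend f16 elements
  // extracted from the same pair of v2f16 vectors, with the outer fma using
  // one element index and the inner fma using the other; only then can the
  // whole expression be expressed as a single fdot2.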
15451 const TargetOptions &Options = DAG.getTarget().Options; 15452 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || 15453 (N->getFlags().hasAllowContract() && 15454 FMA->getFlags().hasAllowContract())) { 15455 Op1 = Op1.getOperand(0); 15456 Op2 = Op2.getOperand(0); 15457 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 15458 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 15459 return SDValue(); 15460 15461 SDValue Vec1 = Op1.getOperand(0); 15462 SDValue Idx1 = Op1.getOperand(1); 15463 SDValue Vec2 = Op2.getOperand(0); 15464 15465 SDValue FMAOp1 = FMA.getOperand(0); 15466 SDValue FMAOp2 = FMA.getOperand(1); 15467 SDValue FMAAcc = FMA.getOperand(2); 15468 15469 if (FMAOp1.getOpcode() != ISD::FP_EXTEND || 15470 FMAOp2.getOpcode() != ISD::FP_EXTEND) 15471 return SDValue(); 15472 15473 FMAOp1 = FMAOp1.getOperand(0); 15474 FMAOp2 = FMAOp2.getOperand(0); 15475 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 15476 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 15477 return SDValue(); 15478 15479 SDValue Vec3 = FMAOp1.getOperand(0); 15480 SDValue Vec4 = FMAOp2.getOperand(0); 15481 SDValue Idx2 = FMAOp1.getOperand(1); 15482 15483 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || 15484 // Idx1 and Idx2 cannot be the same. 15485 Idx1 == Idx2) 15486 return SDValue(); 15487 15488 if (Vec1 == Vec2 || Vec3 == Vec4) 15489 return SDValue(); 15490 15491 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) 15492 return SDValue(); 15493 15494 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) { 15495 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc, 15496 DAG.getTargetConstant(0, SL, MVT::i1)); 15497 } 15498 } 15499 return SDValue(); 15500 } 15501 15502 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 15503 DAGCombinerInfo &DCI) const { 15504 SelectionDAG &DAG = DCI.DAG; 15505 SDLoc SL(N); 15506 15507 SDValue LHS = N->getOperand(0); 15508 SDValue RHS = N->getOperand(1); 15509 EVT VT = LHS.getValueType(); 15510 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 15511 15512 auto *CRHS = dyn_cast<ConstantSDNode>(RHS); 15513 if (!CRHS) { 15514 CRHS = dyn_cast<ConstantSDNode>(LHS); 15515 if (CRHS) { 15516 std::swap(LHS, RHS); 15517 CC = getSetCCSwappedOperands(CC); 15518 } 15519 } 15520 15521 if (CRHS) { 15522 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && 15523 isBoolSGPR(LHS.getOperand(0))) { 15524 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 15525 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc 15526 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 15527 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc 15528 if ((CRHS->isAllOnes() && 15529 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || 15530 (CRHS->isZero() && 15531 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) 15532 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), 15533 DAG.getAllOnesConstant(SL, MVT::i1)); 15534 if ((CRHS->isAllOnes() && 15535 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || 15536 (CRHS->isZero() && 15537 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) 15538 return LHS.getOperand(0); 15539 } 15540 15541 const APInt &CRHSVal = CRHS->getAPIntValue(); 15542 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && 15543 LHS.getOpcode() == ISD::SELECT && 15544 isa<ConstantSDNode>(LHS.getOperand(1)) && 15545 isa<ConstantSDNode>(LHS.getOperand(2)) && 15546 LHS.getConstantOperandVal(1) != 
LHS.getConstantOperandVal(2) && 15547 isBoolSGPR(LHS.getOperand(0))) { 15548 // Given CT != FT: 15549 // setcc (select cc, CT, CF), CF, eq => xor cc, -1 15550 // setcc (select cc, CT, CF), CF, ne => cc 15551 // setcc (select cc, CT, CF), CT, ne => xor cc, -1 15552 // setcc (select cc, CT, CF), CT, eq => cc 15553 const APInt &CT = LHS.getConstantOperandAPInt(1); 15554 const APInt &CF = LHS.getConstantOperandAPInt(2); 15555 15556 if ((CF == CRHSVal && CC == ISD::SETEQ) || 15557 (CT == CRHSVal && CC == ISD::SETNE)) 15558 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), 15559 DAG.getAllOnesConstant(SL, MVT::i1)); 15560 if ((CF == CRHSVal && CC == ISD::SETNE) || 15561 (CT == CRHSVal && CC == ISD::SETEQ)) 15562 return LHS.getOperand(0); 15563 } 15564 } 15565 15566 if (VT != MVT::f32 && VT != MVT::f64 && 15567 (!Subtarget->has16BitInsts() || VT != MVT::f16)) 15568 return SDValue(); 15569 15570 // Match isinf/isfinite pattern 15571 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 15572 // (fcmp one (fabs x), inf) -> (fp_class x, 15573 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero) 15574 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && 15575 LHS.getOpcode() == ISD::FABS) { 15576 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 15577 if (!CRHS) 15578 return SDValue(); 15579 15580 const APFloat &APF = CRHS->getValueAPF(); 15581 if (APF.isInfinity() && !APF.isNegative()) { 15582 const unsigned IsInfMask = 15583 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 15584 const unsigned IsFiniteMask = 15585 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL | 15586 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL | 15587 SIInstrFlags::P_SUBNORMAL; 15588 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask; 15589 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 15590 DAG.getConstant(Mask, SL, MVT::i32)); 15591 } 15592 } 15593 15594 return SDValue(); 15595 } 15596 15597 SDValue 15598 SITargetLowering::performCvtF32UByteNCombine(SDNode *N, 15599 DAGCombinerInfo &DCI) const { 15600 SelectionDAG &DAG = DCI.DAG; 15601 SDLoc SL(N); 15602 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 15603 15604 SDValue Src = N->getOperand(0); 15605 SDValue Shift = N->getOperand(0); 15606 15607 // TODO: Extend type shouldn't matter (assuming legal types). 
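  // The conversion reads byte 'Offset' of its source, so shifting the source
  // right by C bits selects a byte C/8 positions higher, and shifting left
  // selects one C/8 positions lower. For example, cvt_f32_ubyte0 (srl x, 24)
  // reads byte 3 of x and becomes cvt_f32_ubyte3 x.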
15608 if (Shift.getOpcode() == ISD::ZERO_EXTEND) 15609 Shift = Shift.getOperand(0); 15610 15611 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) { 15612 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x 15613 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x 15614 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x 15615 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x 15616 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x 15617 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) { 15618 SDValue Shifted = DAG.getZExtOrTrunc( 15619 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32); 15620 15621 unsigned ShiftOffset = 8 * Offset; 15622 if (Shift.getOpcode() == ISD::SHL) 15623 ShiftOffset -= C->getZExtValue(); 15624 else 15625 ShiftOffset += C->getZExtValue(); 15626 15627 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) { 15628 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL, 15629 MVT::f32, Shifted); 15630 } 15631 } 15632 } 15633 15634 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15635 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 15636 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) { 15637 // We simplified Src. If this node is not dead, visit it again so it is 15638 // folded properly. 15639 if (N->getOpcode() != ISD::DELETED_NODE) 15640 DCI.AddToWorklist(N); 15641 return SDValue(N, 0); 15642 } 15643 15644 // Handle (or x, (srl y, 8)) pattern when known bits are zero. 15645 if (SDValue DemandedSrc = 15646 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG)) 15647 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc); 15648 15649 return SDValue(); 15650 } 15651 15652 SDValue SITargetLowering::performClampCombine(SDNode *N, 15653 DAGCombinerInfo &DCI) const { 15654 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 15655 if (!CSrc) 15656 return SDValue(); 15657 15658 const MachineFunction &MF = DCI.DAG.getMachineFunction(); 15659 const APFloat &F = CSrc->getValueAPF(); 15660 APFloat Zero = APFloat::getZero(F.getSemantics()); 15661 if (F < Zero || 15662 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { 15663 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); 15664 } 15665 15666 APFloat One(F.getSemantics(), "1.0"); 15667 if (F > One) 15668 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); 15669 15670 return SDValue(CSrc, 0); 15671 } 15672 15673 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 15674 DAGCombinerInfo &DCI) const { 15675 switch (N->getOpcode()) { 15676 case ISD::ADD: 15677 case ISD::SUB: 15678 case ISD::SHL: 15679 case ISD::SRL: 15680 case ISD::SRA: 15681 case ISD::AND: 15682 case ISD::OR: 15683 case ISD::XOR: 15684 case ISD::MUL: 15685 case ISD::SETCC: 15686 case ISD::SELECT: 15687 case ISD::SMIN: 15688 case ISD::SMAX: 15689 case ISD::UMIN: 15690 case ISD::UMAX: 15691 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI)) 15692 return Res; 15693 break; 15694 default: 15695 break; 15696 } 15697 15698 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) 15699 return SDValue(); 15700 15701 switch (N->getOpcode()) { 15702 case ISD::ADD: 15703 return performAddCombine(N, DCI); 15704 case ISD::PTRADD: 15705 return performPtrAddCombine(N, DCI); 15706 case ISD::SUB: 15707 return performSubCombine(N, DCI); 15708 case ISD::UADDO_CARRY: 15709 case ISD::USUBO_CARRY: 15710 return performAddCarrySubCarryCombine(N, DCI); 15711 case ISD::FADD: 15712 return performFAddCombine(N, DCI); 15713 
case ISD::FSUB: 15714 return performFSubCombine(N, DCI); 15715 case ISD::FDIV: 15716 return performFDivCombine(N, DCI); 15717 case ISD::FMUL: 15718 return performFMulCombine(N, DCI); 15719 case ISD::SETCC: 15720 return performSetCCCombine(N, DCI); 15721 case ISD::FMAXNUM: 15722 case ISD::FMINNUM: 15723 case ISD::FMAXNUM_IEEE: 15724 case ISD::FMINNUM_IEEE: 15725 case ISD::FMAXIMUM: 15726 case ISD::FMINIMUM: 15727 case ISD::FMAXIMUMNUM: 15728 case ISD::FMINIMUMNUM: 15729 case ISD::SMAX: 15730 case ISD::SMIN: 15731 case ISD::UMAX: 15732 case ISD::UMIN: 15733 case AMDGPUISD::FMIN_LEGACY: 15734 case AMDGPUISD::FMAX_LEGACY: 15735 return performMinMaxCombine(N, DCI); 15736 case ISD::FMA: 15737 return performFMACombine(N, DCI); 15738 case ISD::AND: 15739 return performAndCombine(N, DCI); 15740 case ISD::OR: 15741 return performOrCombine(N, DCI); 15742 case ISD::FSHR: { 15743 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15744 if (N->getValueType(0) == MVT::i32 && N->isDivergent() && 15745 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 15746 return matchPERM(N, DCI); 15747 } 15748 break; 15749 } 15750 case ISD::XOR: 15751 return performXorCombine(N, DCI); 15752 case ISD::ZERO_EXTEND: 15753 return performZeroExtendCombine(N, DCI); 15754 case ISD::SIGN_EXTEND_INREG: 15755 return performSignExtendInRegCombine(N, DCI); 15756 case AMDGPUISD::FP_CLASS: 15757 return performClassCombine(N, DCI); 15758 case ISD::FCANONICALIZE: 15759 return performFCanonicalizeCombine(N, DCI); 15760 case AMDGPUISD::RCP: 15761 return performRcpCombine(N, DCI); 15762 case ISD::FLDEXP: 15763 case AMDGPUISD::FRACT: 15764 case AMDGPUISD::RSQ: 15765 case AMDGPUISD::RCP_LEGACY: 15766 case AMDGPUISD::RCP_IFLAG: 15767 case AMDGPUISD::RSQ_CLAMP: { 15768 // FIXME: This is probably wrong. 
If src is an sNaN, it won't be quieted 15769 SDValue Src = N->getOperand(0); 15770 if (Src.isUndef()) 15771 return Src; 15772 break; 15773 } 15774 case ISD::SINT_TO_FP: 15775 case ISD::UINT_TO_FP: 15776 return performUCharToFloatCombine(N, DCI); 15777 case ISD::FCOPYSIGN: 15778 return performFCopySignCombine(N, DCI); 15779 case AMDGPUISD::CVT_F32_UBYTE0: 15780 case AMDGPUISD::CVT_F32_UBYTE1: 15781 case AMDGPUISD::CVT_F32_UBYTE2: 15782 case AMDGPUISD::CVT_F32_UBYTE3: 15783 return performCvtF32UByteNCombine(N, DCI); 15784 case AMDGPUISD::FMED3: 15785 return performFMed3Combine(N, DCI); 15786 case AMDGPUISD::CVT_PKRTZ_F16_F32: 15787 return performCvtPkRTZCombine(N, DCI); 15788 case AMDGPUISD::CLAMP: 15789 return performClampCombine(N, DCI); 15790 case ISD::SCALAR_TO_VECTOR: { 15791 SelectionDAG &DAG = DCI.DAG; 15792 EVT VT = N->getValueType(0); 15793 15794 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) 15795 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) { 15796 SDLoc SL(N); 15797 SDValue Src = N->getOperand(0); 15798 EVT EltVT = Src.getValueType(); 15799 if (EltVT != MVT::i16) 15800 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src); 15801 15802 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); 15803 return DAG.getNode(ISD::BITCAST, SL, VT, Ext); 15804 } 15805 15806 break; 15807 } 15808 case ISD::EXTRACT_VECTOR_ELT: 15809 return performExtractVectorEltCombine(N, DCI); 15810 case ISD::INSERT_VECTOR_ELT: 15811 return performInsertVectorEltCombine(N, DCI); 15812 case ISD::FP_ROUND: 15813 return performFPRoundCombine(N, DCI); 15814 case ISD::LOAD: { 15815 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI)) 15816 return Widened; 15817 [[fallthrough]]; 15818 } 15819 default: { 15820 if (!DCI.isBeforeLegalize()) { 15821 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N)) 15822 return performMemSDNodeCombine(MemNode, DCI); 15823 } 15824 15825 break; 15826 } 15827 } 15828 15829 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 15830 } 15831 15832 /// Helper function for adjustWritemask 15833 static unsigned SubIdx2Lane(unsigned Idx) { 15834 switch (Idx) { 15835 default: 15836 return ~0u; 15837 case AMDGPU::sub0: 15838 return 0; 15839 case AMDGPU::sub1: 15840 return 1; 15841 case AMDGPU::sub2: 15842 return 2; 15843 case AMDGPU::sub3: 15844 return 3; 15845 case AMDGPU::sub4: 15846 return 4; // Possible with TFE/LWE 15847 } 15848 } 15849 15850 /// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions 15851 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, 15852 SelectionDAG &DAG) const { 15853 unsigned Opcode = Node->getMachineOpcode(); 15854 15855 // Subtract 1 because the vdata output is not a MachineSDNode operand. 
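  // The dmask immediate selects which result channels the image instruction
  // writes. Below we work out which channels are actually read through
  // EXTRACT_SUBREG uses and shrink the dmask (and the result type) to match.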
15856 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1; 15857 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) 15858 return Node; // not implemented for D16 15859 15860 SDNode *Users[5] = {nullptr}; 15861 unsigned Lane = 0; 15862 unsigned DmaskIdx = 15863 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; 15864 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); 15865 unsigned NewDmask = 0; 15866 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; 15867 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; 15868 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || 15869 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)); 15870 unsigned TFCLane = 0; 15871 bool HasChain = Node->getNumValues() > 1; 15872 15873 if (OldDmask == 0) { 15874 // These are folded out, but on the chance it happens don't assert. 15875 return Node; 15876 } 15877 15878 unsigned OldBitsSet = llvm::popcount(OldDmask); 15879 // Work out which is the TFE/LWE lane if that is enabled. 15880 if (UsesTFC) { 15881 TFCLane = OldBitsSet; 15882 } 15883 15884 // Try to figure out the used register components 15885 for (SDUse &Use : Node->uses()) { 15886 15887 // Don't look at users of the chain. 15888 if (Use.getResNo() != 0) 15889 continue; 15890 15891 SDNode *User = Use.getUser(); 15892 15893 // Abort if we can't understand the usage 15894 if (!User->isMachineOpcode() || 15895 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 15896 return Node; 15897 15898 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used. 15899 // Note that subregs are packed, i.e. Lane==0 is the first bit set 15900 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 15901 // set, etc. 15902 Lane = SubIdx2Lane(User->getConstantOperandVal(1)); 15903 if (Lane == ~0u) 15904 return Node; 15905 15906 // Check if the use is for the TFE/LWE generated result at VGPRn+1. 15907 if (UsesTFC && Lane == TFCLane) { 15908 Users[Lane] = User; 15909 } else { 15910 // Set which texture component corresponds to the lane. 15911 unsigned Comp; 15912 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { 15913 Comp = llvm::countr_zero(Dmask); 15914 Dmask &= ~(1 << Comp); 15915 } 15916 15917 // Abort if we have more than one user per component. 15918 if (Users[Lane]) 15919 return Node; 15920 15921 Users[Lane] = User; 15922 NewDmask |= 1 << Comp; 15923 } 15924 } 15925 15926 // Don't allow 0 dmask, as hardware assumes one channel enabled. 15927 bool NoChannels = !NewDmask; 15928 if (NoChannels) { 15929 if (!UsesTFC) { 15930 // No uses of the result and not using TFC. Then do nothing. 15931 return Node; 15932 } 15933 // If the original dmask has one channel - then nothing to do 15934 if (OldBitsSet == 1) 15935 return Node; 15936 // Use an arbitrary dmask - required for the instruction to work 15937 NewDmask = 1; 15938 } 15939 // Abort if there's no change 15940 if (NewDmask == OldDmask) 15941 return Node; 15942 15943 unsigned BitsSet = llvm::popcount(NewDmask); 15944 15945 // Check for TFE or LWE - increase the number of channels by one to account 15946 // for the extra return value 15947 // This will need adjustment for D16 if this is also included in 15948 // adjustWriteMask (this function) but at present D16 are excluded. 
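  // For example, if the new dmask is 0b0101 and TFE is enabled, BitsSet is 2
  // and NewChannels will be 3.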
15949 unsigned NewChannels = BitsSet + UsesTFC; 15950 15951 int NewOpcode = 15952 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels); 15953 assert(NewOpcode != -1 && 15954 NewOpcode != static_cast<int>(Node->getMachineOpcode()) && 15955 "failed to find equivalent MIMG op"); 15956 15957 // Adjust the writemask in the node 15958 SmallVector<SDValue, 12> Ops; 15959 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx)); 15960 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 15961 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1)); 15962 15963 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); 15964 15965 MVT ResultVT = NewChannels == 1 15966 ? SVT 15967 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 15968 : NewChannels == 5 ? 8 15969 : NewChannels); 15970 SDVTList NewVTList = 15971 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); 15972 15973 MachineSDNode *NewNode = 15974 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops); 15975 15976 if (HasChain) { 15977 // Update chain. 15978 DAG.setNodeMemRefs(NewNode, Node->memoperands()); 15979 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); 15980 } 15981 15982 if (NewChannels == 1) { 15983 assert(Node->hasNUsesOfValue(1, 0)); 15984 SDNode *Copy = 15985 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node), 15986 Users[Lane]->getValueType(0), SDValue(NewNode, 0)); 15987 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 15988 return nullptr; 15989 } 15990 15991 // Update the users of the node with the new indices 15992 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) { 15993 SDNode *User = Users[i]; 15994 if (!User) { 15995 // Handle the special case of NoChannels. We set NewDmask to 1 above, but 15996 // Users[0] is still nullptr because channel 0 doesn't really have a use. 15997 if (i || !NoChannels) 15998 continue; 15999 } else { 16000 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 16001 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); 16002 if (NewUser != User) { 16003 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0)); 16004 DAG.RemoveDeadNode(User); 16005 } 16006 } 16007 16008 switch (Idx) { 16009 default: 16010 break; 16011 case AMDGPU::sub0: 16012 Idx = AMDGPU::sub1; 16013 break; 16014 case AMDGPU::sub1: 16015 Idx = AMDGPU::sub2; 16016 break; 16017 case AMDGPU::sub2: 16018 Idx = AMDGPU::sub3; 16019 break; 16020 case AMDGPU::sub3: 16021 Idx = AMDGPU::sub4; 16022 break; 16023 } 16024 } 16025 16026 DAG.RemoveDeadNode(Node); 16027 return nullptr; 16028 } 16029 16030 static bool isFrameIndexOp(SDValue Op) { 16031 if (Op.getOpcode() == ISD::AssertZext) 16032 Op = Op.getOperand(0); 16033 16034 return isa<FrameIndexSDNode>(Op); 16035 } 16036 16037 /// Legalize target independent instructions (e.g. INSERT_SUBREG) 16038 /// with frame index operands. 16039 /// LLVM assumes that inputs to these instructions are registers. 16040 SDNode * 16041 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 16042 SelectionDAG &DAG) const { 16043 if (Node->getOpcode() == ISD::CopyToReg) { 16044 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1)); 16045 SDValue SrcVal = Node->getOperand(2); 16046 16047 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have 16048 // to try understanding copies to physical registers.
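    // That is, CopyToReg(physreg, i1 src) becomes a copy into a fresh VReg_1
    // virtual register followed by a copy from that register to physreg.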
16049 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) { 16050 SDLoc SL(Node); 16051 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 16052 SDValue VReg = DAG.getRegister( 16053 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1); 16054 16055 SDNode *Glued = Node->getGluedNode(); 16056 SDValue ToVReg = DAG.getCopyToReg( 16057 Node->getOperand(0), SL, VReg, SrcVal, 16058 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0)); 16059 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0), 16060 VReg, ToVReg.getValue(1)); 16061 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode()); 16062 DAG.RemoveDeadNode(Node); 16063 return ToResultReg.getNode(); 16064 } 16065 } 16066 16067 SmallVector<SDValue, 8> Ops; 16068 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 16069 if (!isFrameIndexOp(Node->getOperand(i))) { 16070 Ops.push_back(Node->getOperand(i)); 16071 continue; 16072 } 16073 16074 SDLoc DL(Node); 16075 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 16076 Node->getOperand(i).getValueType(), 16077 Node->getOperand(i)), 16078 0)); 16079 } 16080 16081 return DAG.UpdateNodeOperands(Node, Ops); 16082 } 16083 16084 /// Fold the instructions after selecting them. 16085 /// Returns null if users were already updated. 16086 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 16087 SelectionDAG &DAG) const { 16088 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 16089 unsigned Opcode = Node->getMachineOpcode(); 16090 16091 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() && 16092 !TII->isGather4(Opcode) && 16093 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) { 16094 return adjustWritemask(Node, DAG); 16095 } 16096 16097 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) { 16098 legalizeTargetIndependentNode(Node, DAG); 16099 return Node; 16100 } 16101 16102 switch (Opcode) { 16103 case AMDGPU::V_DIV_SCALE_F32_e64: 16104 case AMDGPU::V_DIV_SCALE_F64_e64: { 16105 // Satisfy the operand register constraint when one of the inputs is 16106 // undefined. Ordinarily each undef value will have its own implicit_def of 16107 // a vreg, so force these to use a single register. 16108 SDValue Src0 = Node->getOperand(1); 16109 SDValue Src1 = Node->getOperand(3); 16110 SDValue Src2 = Node->getOperand(5); 16111 16112 if ((Src0.isMachineOpcode() && 16113 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) && 16114 (Src0 == Src1 || Src0 == Src2)) 16115 break; 16116 16117 MVT VT = Src0.getValueType().getSimpleVT(); 16118 const TargetRegisterClass *RC = 16119 getRegClassFor(VT, Src0.getNode()->isDivergent()); 16120 16121 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 16122 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); 16123 16124 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg, 16125 Src0, SDValue()); 16126 16127 // src0 must be the same register as src1 or src2, even if the value is 16128 // undefined, so make sure we don't violate this constraint. 
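    // If src0 is an IMPLICIT_DEF, reuse whichever of src1/src2 is defined; if
    // src1 and src2 are also undefined, fall back to the single shared undef
    // register created above.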
16129 if (Src0.isMachineOpcode() && 16130 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) { 16131 if (Src1.isMachineOpcode() && 16132 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 16133 Src0 = Src1; 16134 else if (Src2.isMachineOpcode() && 16135 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 16136 Src0 = Src2; 16137 else { 16138 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF); 16139 Src0 = UndefReg; 16140 Src1 = UndefReg; 16141 } 16142 } else 16143 break; 16144 16145 SmallVector<SDValue, 9> Ops(Node->ops()); 16146 Ops[1] = Src0; 16147 Ops[3] = Src1; 16148 Ops[5] = Src2; 16149 Ops.push_back(ImpDef.getValue(1)); 16150 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 16151 } 16152 default: 16153 break; 16154 } 16155 16156 return Node; 16157 } 16158 16159 // Any MIMG instructions that use tfe or lwe require an initialization of the 16160 // result register that will be written in the case of a memory access failure. 16161 // The required code is also added to tie this init code to the result of the 16162 // img instruction. 16163 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { 16164 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 16165 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 16166 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); 16167 MachineBasicBlock &MBB = *MI.getParent(); 16168 16169 int DstIdx = 16170 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 16171 unsigned InitIdx = 0; 16172 16173 if (TII->isImage(MI)) { 16174 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); 16175 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); 16176 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); 16177 16178 if (!TFE && !LWE) // intersect_ray 16179 return; 16180 16181 unsigned TFEVal = TFE ? TFE->getImm() : 0; 16182 unsigned LWEVal = LWE ? LWE->getImm() : 0; 16183 unsigned D16Val = D16 ? D16->getImm() : 0; 16184 16185 if (!TFEVal && !LWEVal) 16186 return; 16187 16188 // At least one of TFE or LWE are non-zero 16189 // We have to insert a suitable initialization of the result value and 16190 // tie this to the dest of the image instruction. 16191 16192 // Calculate which dword we have to initialize to 0. 16193 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask); 16194 16195 // check that dmask operand is found. 16196 assert(MO_Dmask && "Expected dmask operand in instruction"); 16197 16198 unsigned dmask = MO_Dmask->getImm(); 16199 // Determine the number of active lanes taking into account the 16200 // Gather4 special case 16201 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask); 16202 16203 bool Packed = !Subtarget->hasUnpackedD16VMem(); 16204 16205 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; 16206 16207 // Abandon attempt if the dst size isn't large enough 16208 // - this is in fact an error but this is picked up elsewhere and 16209 // reported correctly. 16210 uint32_t DstSize = 16211 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; 16212 if (DstSize < InitIdx) 16213 return; 16214 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { 16215 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; 16216 } else { 16217 return; 16218 } 16219 16220 const DebugLoc &DL = MI.getDebugLoc(); 16221 16222 // Create a register for the initialization value. 
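  // The loop below zero-initializes one 32-bit sub-register per iteration,
  // chaining INSERT_SUBREGs from PrevDst into NewDst; the final NewDst is then
  // tied to the instruction's vdata result.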
16223 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg()); 16224 unsigned NewDst = 0; // Final initialized value will be in here 16225 16226 // If PRTStrictNull feature is enabled (the default) then initialize 16227 // all the result registers to 0, otherwise just the error indication 16228 // register (VGPRn+1) 16229 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1; 16230 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1); 16231 16232 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); 16233 for (; SizeLeft; SizeLeft--, CurrIdx++) { 16234 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); 16235 // Initialize dword 16236 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 16237 // clang-format off 16238 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) 16239 .addImm(0); 16240 // clang-format on 16241 // Insert into the super-reg 16242 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) 16243 .addReg(PrevDst) 16244 .addReg(SubReg) 16245 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx)); 16246 16247 PrevDst = NewDst; 16248 } 16249 16250 // Add as an implicit operand 16251 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true)); 16252 16253 // Tie the just added implicit operand to the dst 16254 MI.tieOperands(DstIdx, MI.getNumOperands() - 1); 16255 } 16256 16257 /// Assign the register class depending on the number of 16258 /// bits set in the writemask 16259 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 16260 SDNode *Node) const { 16261 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 16262 16263 MachineFunction *MF = MI.getParent()->getParent(); 16264 MachineRegisterInfo &MRI = MF->getRegInfo(); 16265 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 16266 16267 if (TII->isVOP3(MI.getOpcode())) { 16268 // Make sure constant bus requirements are respected. 16269 TII->legalizeOperandsVOP3(MRI, MI); 16270 16271 // Prefer VGPRs over AGPRs in mAI instructions where possible. 16272 // This saves a chain-copy of registers and better balance register 16273 // use between vgpr and agpr as agpr tuples tend to be big. 16274 if (!MI.getDesc().operands().empty()) { 16275 unsigned Opc = MI.getOpcode(); 16276 bool HasAGPRs = Info->mayNeedAGPRs(); 16277 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 16278 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 16279 for (auto I : 16280 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 16281 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { 16282 if (I == -1) 16283 break; 16284 if ((I == Src2Idx) && (HasAGPRs)) 16285 break; 16286 MachineOperand &Op = MI.getOperand(I); 16287 if (!Op.isReg() || !Op.getReg().isVirtual()) 16288 continue; 16289 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); 16290 if (!TRI->hasAGPRs(RC)) 16291 continue; 16292 auto *Src = MRI.getUniqueVRegDef(Op.getReg()); 16293 if (!Src || !Src->isCopy() || 16294 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) 16295 continue; 16296 auto *NewRC = TRI->getEquivalentVGPRClass(RC); 16297 // All uses of agpr64 and agpr32 can also accept vgpr except for 16298 // v_accvgpr_read, but we do not produce agpr reads during selection, 16299 // so no use checks are needed. 16300 MRI.setRegClass(Op.getReg(), NewRC); 16301 } 16302 16303 if (TII->isMAI(MI)) { 16304 // The ordinary src0, src1, src2 were legalized above. 
16305 // 16306 // We have to also legalize the appended v_mfma_ld_scale_b32 operands, 16307 // as a separate instruction. 16308 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 16309 AMDGPU::OpName::scale_src0); 16310 if (Src0Idx != -1) { 16311 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 16312 AMDGPU::OpName::scale_src1); 16313 if (TII->usesConstantBus(MRI, MI, Src0Idx) && 16314 TII->usesConstantBus(MRI, MI, Src1Idx)) 16315 TII->legalizeOpWithMove(MI, Src1Idx); 16316 } 16317 } 16318 16319 if (!HasAGPRs) 16320 return; 16321 16322 // Resolve the rest of AV operands to AGPRs. 16323 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { 16324 if (Src2->isReg() && Src2->getReg().isVirtual()) { 16325 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); 16326 if (TRI->isVectorSuperClass(RC)) { 16327 auto *NewRC = TRI->getEquivalentAGPRClass(RC); 16328 MRI.setRegClass(Src2->getReg(), NewRC); 16329 if (Src2->isTied()) 16330 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); 16331 } 16332 } 16333 } 16334 } 16335 16336 return; 16337 } 16338 16339 if (TII->isImage(MI)) 16340 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); 16341 } 16342 16343 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, 16344 uint64_t Val) { 16345 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 16346 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 16347 } 16348 16349 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 16350 const SDLoc &DL, 16351 SDValue Ptr) const { 16352 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 16353 16354 // Build the half of the subregister with the constants before building the 16355 // full 128-bit register. If we are building multiple resource descriptors, 16356 // this will allow CSEing of the 2-component register. 16357 const SDValue Ops0[] = { 16358 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 16359 buildSMovImm32(DAG, DL, 0), 16360 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 16361 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 16362 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; 16363 16364 SDValue SubRegHi = SDValue( 16365 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0); 16366 16367 // Combine the constants and the pointer. 16368 const SDValue Ops1[] = { 16369 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr, 16370 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi, 16371 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)}; 16372 16373 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 16374 } 16375 16376 /// Return a resource descriptor with the 'Add TID' bit enabled 16377 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 16378 /// of the resource descriptor) to create an offset, which is added to 16379 /// the resource pointer. 
16380 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, 16381 SDValue Ptr, uint32_t RsrcDword1, 16382 uint64_t RsrcDword2And3) const { 16383 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 16384 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 16385 if (RsrcDword1) { 16386 PtrHi = 16387 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 16388 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 16389 0); 16390 } 16391 16392 SDValue DataLo = 16393 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 16394 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 16395 16396 const SDValue Ops[] = { 16397 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), 16398 PtrLo, 16399 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 16400 PtrHi, 16401 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 16402 DataLo, 16403 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 16404 DataHi, 16405 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)}; 16406 16407 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 16408 } 16409 16410 //===----------------------------------------------------------------------===// 16411 // SI Inline Assembly Support 16412 //===----------------------------------------------------------------------===// 16413 16414 std::pair<unsigned, const TargetRegisterClass *> 16415 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, 16416 StringRef Constraint, 16417 MVT VT) const { 16418 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_); 16419 16420 const TargetRegisterClass *RC = nullptr; 16421 if (Constraint.size() == 1) { 16422 const unsigned BitWidth = VT.getSizeInBits(); 16423 switch (Constraint[0]) { 16424 default: 16425 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 16426 case 's': 16427 case 'r': 16428 switch (BitWidth) { 16429 case 16: 16430 RC = &AMDGPU::SReg_32RegClass; 16431 break; 16432 case 64: 16433 RC = &AMDGPU::SGPR_64RegClass; 16434 break; 16435 default: 16436 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth); 16437 if (!RC) 16438 return std::pair(0U, nullptr); 16439 break; 16440 } 16441 break; 16442 case 'v': 16443 switch (BitWidth) { 16444 case 16: 16445 RC = Subtarget->useRealTrue16Insts() ? 
&AMDGPU::VGPR_16RegClass 16446 : &AMDGPU::VGPR_32RegClass; 16447 break; 16448 default: 16449 RC = TRI->getVGPRClassForBitWidth(BitWidth); 16450 if (!RC) 16451 return std::pair(0U, nullptr); 16452 break; 16453 } 16454 break; 16455 case 'a': 16456 if (!Subtarget->hasMAIInsts()) 16457 break; 16458 switch (BitWidth) { 16459 case 16: 16460 RC = &AMDGPU::AGPR_32RegClass; 16461 break; 16462 default: 16463 RC = TRI->getAGPRClassForBitWidth(BitWidth); 16464 if (!RC) 16465 return std::pair(0U, nullptr); 16466 break; 16467 } 16468 break; 16469 } 16470 // We actually support i128, i16 and f16 as inline parameters 16471 // even if they are not reported as legal 16472 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || 16473 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) 16474 return std::pair(0U, RC); 16475 } 16476 16477 if (Constraint.starts_with("{") && Constraint.ends_with("}")) { 16478 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); 16479 if (RegName.consume_front("v")) { 16480 RC = &AMDGPU::VGPR_32RegClass; 16481 } else if (RegName.consume_front("s")) { 16482 RC = &AMDGPU::SGPR_32RegClass; 16483 } else if (RegName.consume_front("a")) { 16484 RC = &AMDGPU::AGPR_32RegClass; 16485 } 16486 16487 if (RC) { 16488 uint32_t Idx; 16489 if (RegName.consume_front("[")) { 16490 uint32_t End; 16491 bool Failed = RegName.consumeInteger(10, Idx); 16492 Failed |= !RegName.consume_front(":"); 16493 Failed |= RegName.consumeInteger(10, End); 16494 Failed |= !RegName.consume_back("]"); 16495 if (!Failed) { 16496 uint32_t Width = (End - Idx + 1) * 32; 16497 // Prohibit constraints for register ranges with a width that does not 16498 // match the required type. 16499 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits()) 16500 return std::pair(0U, nullptr); 16501 MCRegister Reg = RC->getRegister(Idx); 16502 if (SIRegisterInfo::isVGPRClass(RC)) 16503 RC = TRI->getVGPRClassForBitWidth(Width); 16504 else if (SIRegisterInfo::isSGPRClass(RC)) 16505 RC = TRI->getSGPRClassForBitWidth(Width); 16506 else if (SIRegisterInfo::isAGPRClass(RC)) 16507 RC = TRI->getAGPRClassForBitWidth(Width); 16508 if (RC) { 16509 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); 16510 if (!Reg) { 16511 // The register class does not contain the requested register, 16512 // e.g., because it is an SGPR pair that would violate alignment 16513 // requirements. 16514 return std::pair(0U, nullptr); 16515 } 16516 return std::pair(Reg, RC); 16517 } 16518 } 16519 } else { 16520 // Check for lossy scalar/vector conversions. 
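        // A single named 32-bit register (e.g. {v5}) can only be bound to a
        // vector type that is exactly 32 bits wide; reject wider vectors here.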
16521 if (VT.isVector() && VT.getSizeInBits() != 32) 16522 return std::pair(0U, nullptr); 16523 bool Failed = RegName.getAsInteger(10, Idx); 16524 if (!Failed && Idx < RC->getNumRegs()) 16525 return std::pair(RC->getRegister(Idx), RC); 16526 } 16527 } 16528 } 16529 16530 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 16531 if (Ret.first) 16532 Ret.second = TRI->getPhysRegBaseClass(Ret.first); 16533 16534 return Ret; 16535 } 16536 16537 static bool isImmConstraint(StringRef Constraint) { 16538 if (Constraint.size() == 1) { 16539 switch (Constraint[0]) { 16540 default: 16541 break; 16542 case 'I': 16543 case 'J': 16544 case 'A': 16545 case 'B': 16546 case 'C': 16547 return true; 16548 } 16549 } else if (Constraint == "DA" || Constraint == "DB") { 16550 return true; 16551 } 16552 return false; 16553 } 16554 16555 SITargetLowering::ConstraintType 16556 SITargetLowering::getConstraintType(StringRef Constraint) const { 16557 if (Constraint.size() == 1) { 16558 switch (Constraint[0]) { 16559 default: 16560 break; 16561 case 's': 16562 case 'v': 16563 case 'a': 16564 return C_RegisterClass; 16565 } 16566 } 16567 if (isImmConstraint(Constraint)) { 16568 return C_Other; 16569 } 16570 return TargetLowering::getConstraintType(Constraint); 16571 } 16572 16573 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { 16574 if (!AMDGPU::isInlinableIntLiteral(Val)) { 16575 Val = Val & maskTrailingOnes<uint64_t>(Size); 16576 } 16577 return Val; 16578 } 16579 16580 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, 16581 StringRef Constraint, 16582 std::vector<SDValue> &Ops, 16583 SelectionDAG &DAG) const { 16584 if (isImmConstraint(Constraint)) { 16585 uint64_t Val; 16586 if (getAsmOperandConstVal(Op, Val) && 16587 checkAsmConstraintVal(Op, Constraint, Val)) { 16588 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits()); 16589 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64)); 16590 } 16591 } else { 16592 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 16593 } 16594 } 16595 16596 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { 16597 unsigned Size = Op.getScalarValueSizeInBits(); 16598 if (Size > 64) 16599 return false; 16600 16601 if (Size == 16 && !Subtarget->has16BitInsts()) 16602 return false; 16603 16604 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16605 Val = C->getSExtValue(); 16606 return true; 16607 } 16608 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) { 16609 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); 16610 return true; 16611 } 16612 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) { 16613 if (Size != 16 || Op.getNumOperands() != 2) 16614 return false; 16615 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef()) 16616 return false; 16617 if (ConstantSDNode *C = V->getConstantSplatNode()) { 16618 Val = C->getSExtValue(); 16619 return true; 16620 } 16621 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { 16622 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); 16623 return true; 16624 } 16625 } 16626 16627 return false; 16628 } 16629 16630 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, 16631 uint64_t Val) const { 16632 if (Constraint.size() == 1) { 16633 switch (Constraint[0]) { 16634 case 'I': 16635 return AMDGPU::isInlinableIntLiteral(Val); 16636 case 'J': 16637 return isInt<16>(Val); 16638 case 'A': 16639 return checkAsmConstraintValA(Op, Val); 16640 case 'B': 16641 return 
isInt<32>(Val); 16642 case 'C': 16643 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) || 16644 AMDGPU::isInlinableIntLiteral(Val); 16645 default: 16646 break; 16647 } 16648 } else if (Constraint.size() == 2) { 16649 if (Constraint == "DA") { 16650 int64_t HiBits = static_cast<int32_t>(Val >> 32); 16651 int64_t LoBits = static_cast<int32_t>(Val); 16652 return checkAsmConstraintValA(Op, HiBits, 32) && 16653 checkAsmConstraintValA(Op, LoBits, 32); 16654 } 16655 if (Constraint == "DB") { 16656 return true; 16657 } 16658 } 16659 llvm_unreachable("Invalid asm constraint"); 16660 } 16661 16662 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val, 16663 unsigned MaxSize) const { 16664 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize); 16665 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); 16666 if (Size == 16) { 16667 MVT VT = Op.getSimpleValueType(); 16668 switch (VT.SimpleTy) { 16669 default: 16670 return false; 16671 case MVT::i16: 16672 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi); 16673 case MVT::f16: 16674 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi); 16675 case MVT::bf16: 16676 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi); 16677 case MVT::v2i16: 16678 return AMDGPU::getInlineEncodingV2I16(Val).has_value(); 16679 case MVT::v2f16: 16680 return AMDGPU::getInlineEncodingV2F16(Val).has_value(); 16681 case MVT::v2bf16: 16682 return AMDGPU::getInlineEncodingV2BF16(Val).has_value(); 16683 } 16684 } 16685 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) || 16686 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) 16687 return true; 16688 return false; 16689 } 16690 16691 static int getAlignedAGPRClassID(unsigned UnalignedClassID) { 16692 switch (UnalignedClassID) { 16693 case AMDGPU::VReg_64RegClassID: 16694 return AMDGPU::VReg_64_Align2RegClassID; 16695 case AMDGPU::VReg_96RegClassID: 16696 return AMDGPU::VReg_96_Align2RegClassID; 16697 case AMDGPU::VReg_128RegClassID: 16698 return AMDGPU::VReg_128_Align2RegClassID; 16699 case AMDGPU::VReg_160RegClassID: 16700 return AMDGPU::VReg_160_Align2RegClassID; 16701 case AMDGPU::VReg_192RegClassID: 16702 return AMDGPU::VReg_192_Align2RegClassID; 16703 case AMDGPU::VReg_224RegClassID: 16704 return AMDGPU::VReg_224_Align2RegClassID; 16705 case AMDGPU::VReg_256RegClassID: 16706 return AMDGPU::VReg_256_Align2RegClassID; 16707 case AMDGPU::VReg_288RegClassID: 16708 return AMDGPU::VReg_288_Align2RegClassID; 16709 case AMDGPU::VReg_320RegClassID: 16710 return AMDGPU::VReg_320_Align2RegClassID; 16711 case AMDGPU::VReg_352RegClassID: 16712 return AMDGPU::VReg_352_Align2RegClassID; 16713 case AMDGPU::VReg_384RegClassID: 16714 return AMDGPU::VReg_384_Align2RegClassID; 16715 case AMDGPU::VReg_512RegClassID: 16716 return AMDGPU::VReg_512_Align2RegClassID; 16717 case AMDGPU::VReg_1024RegClassID: 16718 return AMDGPU::VReg_1024_Align2RegClassID; 16719 case AMDGPU::AReg_64RegClassID: 16720 return AMDGPU::AReg_64_Align2RegClassID; 16721 case AMDGPU::AReg_96RegClassID: 16722 return AMDGPU::AReg_96_Align2RegClassID; 16723 case AMDGPU::AReg_128RegClassID: 16724 return AMDGPU::AReg_128_Align2RegClassID; 16725 case AMDGPU::AReg_160RegClassID: 16726 return AMDGPU::AReg_160_Align2RegClassID; 16727 case AMDGPU::AReg_192RegClassID: 16728 return AMDGPU::AReg_192_Align2RegClassID; 16729 case AMDGPU::AReg_256RegClassID: 16730 return AMDGPU::AReg_256_Align2RegClassID; 16731 case AMDGPU::AReg_512RegClassID: 16732 return AMDGPU::AReg_512_Align2RegClassID; 16733 case 
AMDGPU::AReg_1024RegClassID: 16734 return AMDGPU::AReg_1024_Align2RegClassID; 16735 default: 16736 return -1; 16737 } 16738 } 16739 16740 // Figure out which registers should be reserved for stack access. Only after 16741 // the function is legalized do we know all of the non-spill stack objects or if 16742 // calls are present. 16743 void SITargetLowering::finalizeLowering(MachineFunction &MF) const { 16744 MachineRegisterInfo &MRI = MF.getRegInfo(); 16745 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 16746 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 16747 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 16748 const SIInstrInfo *TII = ST.getInstrInfo(); 16749 16750 if (Info->isEntryFunction()) { 16751 // Callable functions have fixed registers used for stack access. 16752 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); 16753 } 16754 16755 // TODO: Move this logic to getReservedRegs() 16756 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. 16757 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 16758 Register SReg = ST.isWave32() 16759 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) 16760 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2, 16761 &AMDGPU::SGPR_64RegClass); 16762 Info->setSGPRForEXECCopy(SReg); 16763 16764 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), 16765 Info->getStackPtrOffsetReg())); 16766 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) 16767 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); 16768 16769 // We need to worry about replacing the default register with itself in case 16770 // of MIR testcases missing the MFI. 16771 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG) 16772 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); 16773 16774 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) 16775 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); 16776 16777 Info->limitOccupancy(MF); 16778 16779 if (ST.isWave32() && !MF.empty()) { 16780 for (auto &MBB : MF) { 16781 for (auto &MI : MBB) { 16782 TII->fixImplicitOperands(MI); 16783 } 16784 } 16785 } 16786 16787 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned 16788 // classes if required. Ideally the register class constraints would differ 16789 // per-subtarget, but there's no easy way to achieve that right now. This is 16790 // not a problem for VGPRs because the correctly aligned VGPR class is implied 16791 // from using them as the register class for legal types. 
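  // getAlignedAGPRClassID returns -1 when a class has no even-aligned variant,
  // in which case the virtual register keeps its original class.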
16792 if (ST.needsAlignedVGPRs()) { 16793 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { 16794 const Register Reg = Register::index2VirtReg(I); 16795 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); 16796 if (!RC) 16797 continue; 16798 int NewClassID = getAlignedAGPRClassID(RC->getID()); 16799 if (NewClassID != -1) 16800 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID)); 16801 } 16802 } 16803 16804 TargetLoweringBase::finalizeLowering(MF); 16805 } 16806 16807 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 16808 KnownBits &Known, 16809 const APInt &DemandedElts, 16810 const SelectionDAG &DAG, 16811 unsigned Depth) const { 16812 Known.resetAll(); 16813 unsigned Opc = Op.getOpcode(); 16814 switch (Opc) { 16815 case ISD::INTRINSIC_WO_CHAIN: { 16816 unsigned IID = Op.getConstantOperandVal(0); 16817 switch (IID) { 16818 case Intrinsic::amdgcn_mbcnt_lo: 16819 case Intrinsic::amdgcn_mbcnt_hi: { 16820 const GCNSubtarget &ST = 16821 DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); 16822 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at 16823 // most 31 + src1. 16824 Known.Zero.setBitsFrom( 16825 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5); 16826 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); 16827 Known = KnownBits::add(Known, Known2); 16828 return; 16829 } 16830 } 16831 break; 16832 } 16833 } 16834 return AMDGPUTargetLowering::computeKnownBitsForTargetNode( 16835 Op, Known, DemandedElts, DAG, Depth); 16836 } 16837 16838 void SITargetLowering::computeKnownBitsForFrameIndex( 16839 const int FI, KnownBits &Known, const MachineFunction &MF) const { 16840 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF); 16841 16842 // Set the high bits to zero based on the maximum allowed scratch size per 16843 // wave. We can't use vaddr in MUBUF instructions if we don't know the address 16844 // calculation won't overflow, so assume the sign bit is never set. 16845 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); 16846 } 16847 16848 static void knownBitsForWorkitemID(const GCNSubtarget &ST, 16849 GISelValueTracking &VT, KnownBits &Known, 16850 unsigned Dim) { 16851 unsigned MaxValue = 16852 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim); 16853 Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); 16854 } 16855 16856 void SITargetLowering::computeKnownBitsForTargetInstr( 16857 GISelValueTracking &VT, Register R, KnownBits &Known, 16858 const APInt &DemandedElts, const MachineRegisterInfo &MRI, 16859 unsigned Depth) const { 16860 const MachineInstr *MI = MRI.getVRegDef(R); 16861 switch (MI->getOpcode()) { 16862 case AMDGPU::G_INTRINSIC: 16863 case AMDGPU::G_INTRINSIC_CONVERGENT: { 16864 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID(); 16865 switch (IID) { 16866 case Intrinsic::amdgcn_workitem_id_x: 16867 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0); 16868 break; 16869 case Intrinsic::amdgcn_workitem_id_y: 16870 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1); 16871 break; 16872 case Intrinsic::amdgcn_workitem_id_z: 16873 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2); 16874 break; 16875 case Intrinsic::amdgcn_mbcnt_lo: 16876 case Intrinsic::amdgcn_mbcnt_hi: { 16877 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at 16878 // most 31 + src1. 16879 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo 16880 ? 
getSubtarget()->getWavefrontSizeLog2()
16881 : 5);
16882 KnownBits Known2;
16883 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16884 Depth + 1);
16885 Known = KnownBits::add(Known, Known2);
16886 break;
16887 }
16888 case Intrinsic::amdgcn_groupstaticsize: {
16889 // We can report everything over the maximum size as 0. We can't report
16890 // based on the actual size because we don't know if it's accurate or not
16891 // at any given point.
16892 Known.Zero.setHighBits(
16893 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16894 break;
16895 }
16896 }
16897 break;
16898 }
16899 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16900 Known.Zero.setHighBits(24);
16901 break;
16902 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16903 Known.Zero.setHighBits(16);
16904 break;
16905 case AMDGPU::G_AMDGPU_SMED3:
16906 case AMDGPU::G_AMDGPU_UMED3: {
16907 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16908
16909 KnownBits Known2;
16910 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16911 if (Known2.isUnknown())
16912 break;
16913
16914 KnownBits Known1;
16915 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16916 if (Known1.isUnknown())
16917 break;
16918
16919 KnownBits Known0;
16920 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16921 if (Known0.isUnknown())
16922 break;
16923
16924 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16925 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16926 Known.One = Known0.One & Known1.One & Known2.One;
16927 break;
16928 }
16929 }
16930 }
16931
16932 Align SITargetLowering::computeKnownAlignForTargetInstr(
16933 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
16934 unsigned Depth) const {
16935 const MachineInstr *MI = MRI.getVRegDef(R);
16936 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16937 // FIXME: Can this move to generic code? What about the case where the call
16938 // site specifies a lower alignment?
16939 Intrinsic::ID IID = GI->getIntrinsicID();
16940 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
16941 AttributeList Attrs =
16942 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
16943 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16944 return *RetAlign;
16945 }
16946 return Align(1);
16947 }
16948
16949 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16950 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16951 const Align CacheLineAlign = Align(64);
16952
16953 // Pre-GFX10 targets did not benefit from loop alignment.
16954 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16955 getSubtarget()->hasInstFwdPrefetchBug())
16956 return PrefAlign;
16957
16958 // On GFX10 the instruction cache consists of 4 x 64-byte cache lines.
16959 // By default the prefetcher keeps one cache line behind and reads two ahead.
16960 // We can switch it with S_INST_PREFETCH so that larger loops keep two lines
16961 // behind and one ahead.
16962 // Therefore aligning a loop header only helps if the loop fits in 192 bytes.
16963 // If the loop fits in 64 bytes it never spans more than two cache lines and
16964 // does not need alignment.
16965 // Otherwise, if the loop is at most 128 bytes, alignment alone is enough and
16966 // we do not need to modify the prefetch; if it is at most 192 bytes, we also
// need to keep two lines behind the PC.
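//
// For orientation, a rough standalone model of the resulting policy (only an
// illustration of the logic implemented below, not code used by the compiler;
// the names are made up):
//
//   struct LoopCodePolicy {
//     unsigned AlignInBytes;    // requested loop header alignment
//     bool AdjustInstPrefetch;  // bracket the loop with S_INST_PREFETCH
//   };
//
//   static LoopCodePolicy classifyLoop(unsigned LoopSizeInBytes,
//                                      unsigned DefaultAlign) {
//     if (LoopSizeInBytes <= 64 || LoopSizeInBytes > 192)
//       return {DefaultAlign, false}; // too small to matter / too big to help
//     if (LoopSizeInBytes <= 128)
//       return {64, false};           // cache-line alignment alone is enough
//     return {64, true};              // 129..192 bytes: also adjust prefetch
//   }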
16967 16968 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 16969 const MachineBasicBlock *Header = ML->getHeader(); 16970 if (Header->getAlignment() != PrefAlign) 16971 return Header->getAlignment(); // Already processed. 16972 16973 unsigned LoopSize = 0; 16974 for (const MachineBasicBlock *MBB : ML->blocks()) { 16975 // If inner loop block is aligned assume in average half of the alignment 16976 // size to be added as nops. 16977 if (MBB != Header) 16978 LoopSize += MBB->getAlignment().value() / 2; 16979 16980 for (const MachineInstr &MI : *MBB) { 16981 LoopSize += TII->getInstSizeInBytes(MI); 16982 if (LoopSize > 192) 16983 return PrefAlign; 16984 } 16985 } 16986 16987 if (LoopSize <= 64) 16988 return PrefAlign; 16989 16990 if (LoopSize <= 128) 16991 return CacheLineAlign; 16992 16993 // If any of parent loops is surrounded by prefetch instructions do not 16994 // insert new for inner loop, which would reset parent's settings. 16995 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) { 16996 if (MachineBasicBlock *Exit = P->getExitBlock()) { 16997 auto I = Exit->getFirstNonDebugInstr(); 16998 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH) 16999 return CacheLineAlign; 17000 } 17001 } 17002 17003 MachineBasicBlock *Pre = ML->getLoopPreheader(); 17004 MachineBasicBlock *Exit = ML->getExitBlock(); 17005 17006 if (Pre && Exit) { 17007 auto PreTerm = Pre->getFirstTerminator(); 17008 if (PreTerm == Pre->begin() || 17009 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH) 17010 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) 17011 .addImm(1); // prefetch 2 lines behind PC 17012 17013 auto ExitHead = Exit->getFirstNonDebugInstr(); 17014 if (ExitHead == Exit->end() || 17015 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH) 17016 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) 17017 .addImm(2); // prefetch 1 line behind PC 17018 } 17019 17020 return CacheLineAlign; 17021 } 17022 17023 LLVM_ATTRIBUTE_UNUSED 17024 static bool isCopyFromRegOfInlineAsm(const SDNode *N) { 17025 assert(N->getOpcode() == ISD::CopyFromReg); 17026 do { 17027 // Follow the chain until we find an INLINEASM node. 17028 N = N->getOperand(0).getNode(); 17029 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR) 17030 return true; 17031 } while (N->getOpcode() == ISD::CopyFromReg); 17032 return false; 17033 } 17034 17035 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, 17036 FunctionLoweringInfo *FLI, 17037 UniformityInfo *UA) const { 17038 switch (N->getOpcode()) { 17039 case ISD::CopyFromReg: { 17040 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); 17041 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); 17042 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 17043 Register Reg = R->getReg(); 17044 17045 // FIXME: Why does this need to consider isLiveIn? 17046 if (Reg.isPhysical() || MRI.isLiveIn(Reg)) 17047 return !TRI->isSGPRReg(MRI, Reg); 17048 17049 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) 17050 return UA->isDivergent(V); 17051 17052 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); 17053 return !TRI->isSGPRReg(MRI, Reg); 17054 } 17055 case ISD::LOAD: { 17056 const LoadSDNode *L = cast<LoadSDNode>(N); 17057 unsigned AS = L->getAddressSpace(); 17058 // A flat load may access private memory. 
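// Private (scratch) memory is per-lane storage, so a value loaded through a
// flat pointer must conservatively be treated as divergent even when the
// address itself is uniform.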
17059 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; 17060 } 17061 case ISD::CALLSEQ_END: 17062 return true; 17063 case ISD::INTRINSIC_WO_CHAIN: 17064 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0)); 17065 case ISD::INTRINSIC_W_CHAIN: 17066 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); 17067 case AMDGPUISD::ATOMIC_CMP_SWAP: 17068 case AMDGPUISD::BUFFER_ATOMIC_SWAP: 17069 case AMDGPUISD::BUFFER_ATOMIC_ADD: 17070 case AMDGPUISD::BUFFER_ATOMIC_SUB: 17071 case AMDGPUISD::BUFFER_ATOMIC_SMIN: 17072 case AMDGPUISD::BUFFER_ATOMIC_UMIN: 17073 case AMDGPUISD::BUFFER_ATOMIC_SMAX: 17074 case AMDGPUISD::BUFFER_ATOMIC_UMAX: 17075 case AMDGPUISD::BUFFER_ATOMIC_AND: 17076 case AMDGPUISD::BUFFER_ATOMIC_OR: 17077 case AMDGPUISD::BUFFER_ATOMIC_XOR: 17078 case AMDGPUISD::BUFFER_ATOMIC_INC: 17079 case AMDGPUISD::BUFFER_ATOMIC_DEC: 17080 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: 17081 case AMDGPUISD::BUFFER_ATOMIC_CSUB: 17082 case AMDGPUISD::BUFFER_ATOMIC_FADD: 17083 case AMDGPUISD::BUFFER_ATOMIC_FMIN: 17084 case AMDGPUISD::BUFFER_ATOMIC_FMAX: 17085 // Target-specific read-modify-write atomics are sources of divergence. 17086 return true; 17087 default: 17088 if (auto *A = dyn_cast<AtomicSDNode>(N)) { 17089 // Generic read-modify-write atomics are sources of divergence. 17090 return A->readMem() && A->writeMem(); 17091 } 17092 return false; 17093 } 17094 } 17095 17096 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, 17097 EVT VT) const { 17098 switch (VT.getScalarType().getSimpleVT().SimpleTy) { 17099 case MVT::f32: 17100 return !denormalModeIsFlushAllF32(DAG.getMachineFunction()); 17101 case MVT::f64: 17102 case MVT::f16: 17103 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); 17104 default: 17105 return false; 17106 } 17107 } 17108 17109 bool SITargetLowering::denormalsEnabledForType( 17110 LLT Ty, const MachineFunction &MF) const { 17111 switch (Ty.getScalarSizeInBits()) { 17112 case 32: 17113 return !denormalModeIsFlushAllF32(MF); 17114 case 64: 17115 case 16: 17116 return !denormalModeIsFlushAllF64F16(MF); 17117 default: 17118 return false; 17119 } 17120 } 17121 17122 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, 17123 const APInt &DemandedElts, 17124 const SelectionDAG &DAG, 17125 bool SNaN, 17126 unsigned Depth) const { 17127 if (Op.getOpcode() == AMDGPUISD::CLAMP) { 17128 const MachineFunction &MF = DAG.getMachineFunction(); 17129 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 17130 17131 if (Info->getMode().DX10Clamp) 17132 return true; // Clamped to 0. 17133 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 17134 } 17135 17136 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts, 17137 DAG, SNaN, Depth); 17138 } 17139 17140 // On older subtargets, global FP atomic instructions have a hardcoded FP mode 17141 // and do not support FP32 denormals, and only support v2f16/f64 denormals. 17142 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) { 17143 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode")) 17144 return true; 17145 17146 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); 17147 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt); 17148 if (DenormMode == DenormalMode::getPreserveSign()) 17149 return true; 17150 17151 // TODO: Remove this. 
17152 return RMW->getFunction() 17153 ->getFnAttribute("amdgpu-unsafe-fp-atomics") 17154 .getValueAsBool(); 17155 } 17156 17157 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { 17158 LLVMContext &Ctx = RMW->getContext(); 17159 StringRef MemScope = 17160 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system"); 17161 17162 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW) 17163 << "Hardware instruction generated for atomic " 17164 << RMW->getOperationName(RMW->getOperation()) 17165 << " operation at memory scope " << MemScope; 17166 } 17167 17168 static bool isV2F16OrV2BF16(Type *Ty) { 17169 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { 17170 Type *EltTy = VT->getElementType(); 17171 return VT->getNumElements() == 2 && 17172 (EltTy->isHalfTy() || EltTy->isBFloatTy()); 17173 } 17174 17175 return false; 17176 } 17177 17178 static bool isV2F16(Type *Ty) { 17179 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); 17180 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); 17181 } 17182 17183 static bool isV2BF16(Type *Ty) { 17184 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); 17185 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); 17186 } 17187 17188 /// \return true if atomicrmw integer ops work for the type. 17189 static bool isAtomicRMWLegalIntTy(Type *Ty) { 17190 if (auto *IT = dyn_cast<IntegerType>(Ty)) { 17191 unsigned BW = IT->getBitWidth(); 17192 return BW == 32 || BW == 64; 17193 } 17194 17195 return false; 17196 } 17197 17198 /// \return true if this atomicrmw xchg type can be selected. 17199 static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) { 17200 Type *Ty = RMW->getType(); 17201 if (isAtomicRMWLegalIntTy(Ty)) 17202 return true; 17203 17204 if (PointerType *PT = dyn_cast<PointerType>(Ty)) { 17205 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout(); 17206 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace()); 17207 return BW == 32 || BW == 64; 17208 } 17209 17210 if (Ty->isFloatTy() || Ty->isDoubleTy()) 17211 return true; 17212 17213 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) { 17214 return VT->getNumElements() == 2 && 17215 VT->getElementType()->getPrimitiveSizeInBits() == 16; 17216 } 17217 17218 return false; 17219 } 17220 17221 /// \returns true if it's valid to emit a native instruction for \p RMW, based 17222 /// on the properties of the target memory. 17223 static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, 17224 const AtomicRMWInst *RMW, 17225 bool HasSystemScope) { 17226 // The remote/fine-grained access logic is different from the integer 17227 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support, 17228 // fine-grained access does not work, even for a device local allocation. 17229 // 17230 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local 17231 // allocations work. 17232 if (HasSystemScope) { 17233 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && 17234 RMW->hasMetadata("amdgpu.no.remote.memory")) 17235 return true; 17236 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) 17237 return true; 17238 17239 return RMW->hasMetadata("amdgpu.no.fine.grained.memory"); 17240 } 17241 17242 /// \return Action to perform on AtomicRMWInsts for integer operations. 17243 static TargetLowering::AtomicExpansionKind 17244 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { 17245 return isAtomicRMWLegalIntTy(RMW->getType()) 17246 ? 
TargetLowering::AtomicExpansionKind::None 17247 : TargetLowering::AtomicExpansionKind::CmpXChg; 17248 } 17249 17250 /// Return if a flat address space atomicrmw can access private memory. 17251 static bool flatInstrMayAccessPrivate(const Instruction *I) { 17252 const MDNode *NoaliasAddrSpaceMD = 17253 I->getMetadata(LLVMContext::MD_noalias_addrspace); 17254 if (!NoaliasAddrSpaceMD) 17255 return true; 17256 17257 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E; 17258 ++I) { 17259 auto *Low = mdconst::extract<ConstantInt>( 17260 NoaliasAddrSpaceMD->getOperand(2 * I + 0)); 17261 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) { 17262 auto *High = mdconst::extract<ConstantInt>( 17263 NoaliasAddrSpaceMD->getOperand(2 * I + 1)); 17264 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS); 17265 } 17266 } 17267 17268 return true; 17269 } 17270 17271 TargetLowering::AtomicExpansionKind 17272 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { 17273 unsigned AS = RMW->getPointerAddressSpace(); 17274 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 17275 return AtomicExpansionKind::NotAtomic; 17276 17277 // 64-bit flat atomics that dynamically reside in private memory will silently 17278 // be dropped. 17279 // 17280 // Note that we will emit a new copy of the original atomic in the expansion, 17281 // which will be incrementally relegalized. 17282 const DataLayout &DL = RMW->getFunction()->getDataLayout(); 17283 if (AS == AMDGPUAS::FLAT_ADDRESS && 17284 DL.getTypeSizeInBits(RMW->getType()) == 64 && 17285 flatInstrMayAccessPrivate(RMW)) 17286 return AtomicExpansionKind::Expand; 17287 17288 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) { 17289 OptimizationRemarkEmitter ORE(RMW->getFunction()); 17290 ORE.emit([=]() { 17291 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request."; 17292 }); 17293 return Kind; 17294 }; 17295 17296 auto SSID = RMW->getSyncScopeID(); 17297 bool HasSystemScope = 17298 SSID == SyncScope::System || 17299 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); 17300 17301 auto Op = RMW->getOperation(); 17302 switch (Op) { 17303 case AtomicRMWInst::Xchg: { 17304 // PCIe supports add and xchg for system atomics. 17305 return isAtomicRMWLegalXChgTy(RMW) 17306 ? TargetLowering::AtomicExpansionKind::None 17307 : TargetLowering::AtomicExpansionKind::CmpXChg; 17308 } 17309 case AtomicRMWInst::Add: 17310 case AtomicRMWInst::And: 17311 case AtomicRMWInst::UIncWrap: 17312 case AtomicRMWInst::UDecWrap: 17313 return atomicSupportedIfLegalIntType(RMW); 17314 case AtomicRMWInst::Sub: 17315 case AtomicRMWInst::Or: 17316 case AtomicRMWInst::Xor: { 17317 // Atomic sub/or/xor do not work over PCI express, but atomic add 17318 // does. InstCombine transforms these with 0 to or, so undo that. 17319 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) { 17320 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); 17321 ConstVal && ConstVal->isNullValue()) 17322 return AtomicExpansionKind::Expand; 17323 } 17324 17325 return atomicSupportedIfLegalIntType(RMW); 17326 } 17327 case AtomicRMWInst::FAdd: { 17328 Type *Ty = RMW->getType(); 17329 17330 // TODO: Handle REGION_ADDRESS 17331 if (AS == AMDGPUAS::LOCAL_ADDRESS) { 17332 // DS F32 FP atomics do respect the denormal mode, but the rounding mode 17333 // is fixed to round-to-nearest-even. 17334 // 17335 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to 17336 // round-to-nearest-even. 
17337 // 17338 // We ignore the rounding mode problem, even in strictfp. The C++ standard 17339 // suggests it is OK if the floating-point mode may not match the calling 17340 // thread. 17341 if (Ty->isFloatTy()) { 17342 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None 17343 : AtomicExpansionKind::CmpXChg; 17344 } 17345 17346 if (Ty->isDoubleTy()) { 17347 // Ignores denormal mode, but we don't consider flushing mandatory. 17348 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None 17349 : AtomicExpansionKind::CmpXChg; 17350 } 17351 17352 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty)) 17353 return AtomicExpansionKind::None; 17354 17355 return AtomicExpansionKind::CmpXChg; 17356 } 17357 17358 // LDS atomics respect the denormal mode from the mode register. 17359 // 17360 // Traditionally f32 global/buffer memory atomics would unconditionally 17361 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never 17362 // flush. 17363 // 17364 // On targets with flat atomic fadd, denormals would flush depending on 17365 // whether the target address resides in LDS or global memory. We consider 17366 // this flat-maybe-flush as will-flush. 17367 if (Ty->isFloatTy() && 17368 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() && 17369 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW)) 17370 return AtomicExpansionKind::CmpXChg; 17371 17372 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are 17373 // safe. The message phrasing also should be better. 17374 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { 17375 if (AS == AMDGPUAS::FLAT_ADDRESS) { 17376 // gfx942, gfx12 17377 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty)) 17378 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17379 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { 17380 // gfx90a, gfx942, gfx12 17381 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) 17382 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17383 17384 // gfx942, gfx12 17385 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty)) 17386 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17387 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { 17388 // gfx90a, gfx942, gfx12 17389 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) 17390 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17391 17392 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for 17393 // buffer. gfx12 does have the buffer version. 17394 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty)) 17395 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17396 } 17397 17398 // global and flat atomic fadd f64: gfx90a, gfx942. 17399 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) 17400 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17401 17402 if (AS != AMDGPUAS::FLAT_ADDRESS) { 17403 if (Ty->isFloatTy()) { 17404 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942, 17405 // gfx11+. 17406 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) 17407 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17408 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+. 
17409 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) 17410 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17411 } else { 17412 // gfx908 17413 if (RMW->use_empty() && 17414 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && 17415 isV2F16(Ty)) 17416 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17417 } 17418 } 17419 17420 // flat atomic fadd f32: gfx942, gfx11+. 17421 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { 17422 if (Subtarget->hasFlatAtomicFaddF32Inst()) 17423 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17424 17425 // If it is in flat address space, and the type is float, we will try to 17426 // expand it, if the target supports global and lds atomic fadd. The 17427 // reason we need that is, in the expansion, we emit the check of 17428 // address space. If it is in global address space, we emit the global 17429 // atomic fadd; if it is in shared address space, we emit the LDS atomic 17430 // fadd. 17431 if (Subtarget->hasLDSFPAtomicAddF32()) { 17432 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) 17433 return AtomicExpansionKind::Expand; 17434 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) 17435 return AtomicExpansionKind::Expand; 17436 } 17437 } 17438 } 17439 17440 return AtomicExpansionKind::CmpXChg; 17441 } 17442 case AtomicRMWInst::FMin: 17443 case AtomicRMWInst::FMax: { 17444 Type *Ty = RMW->getType(); 17445 17446 // LDS float and double fmin/fmax were always supported. 17447 if (AS == AMDGPUAS::LOCAL_ADDRESS) { 17448 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None 17449 : AtomicExpansionKind::CmpXChg; 17450 } 17451 17452 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { 17453 // For flat and global cases: 17454 // float, double in gfx7. Manual claims denormal support. 17455 // Removed in gfx8. 17456 // float, double restored in gfx10. 17457 // double removed again in gfx11, so only f32 for gfx11/gfx12. 17458 // 17459 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but 17460 // no f32. 17461 if (AS == AMDGPUAS::FLAT_ADDRESS) { 17462 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) 17463 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17464 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) 17465 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17466 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || 17467 AS == AMDGPUAS::BUFFER_FAT_POINTER) { 17468 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) 17469 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17470 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) 17471 return ReportUnsafeHWInst(AtomicExpansionKind::None); 17472 } 17473 } 17474 17475 return AtomicExpansionKind::CmpXChg; 17476 } 17477 case AtomicRMWInst::Min: 17478 case AtomicRMWInst::Max: 17479 case AtomicRMWInst::UMin: 17480 case AtomicRMWInst::UMax: { 17481 if (AMDGPU::isFlatGlobalAddrSpace(AS) || 17482 AS == AMDGPUAS::BUFFER_FAT_POINTER) { 17483 // Always expand system scope min/max atomics. 
17484 if (HasSystemScope) 17485 return AtomicExpansionKind::CmpXChg; 17486 } 17487 17488 return atomicSupportedIfLegalIntType(RMW); 17489 } 17490 case AtomicRMWInst::Nand: 17491 case AtomicRMWInst::FSub: 17492 default: 17493 return AtomicExpansionKind::CmpXChg; 17494 } 17495 17496 llvm_unreachable("covered atomicrmw op switch"); 17497 } 17498 17499 TargetLowering::AtomicExpansionKind 17500 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 17501 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS 17502 ? AtomicExpansionKind::NotAtomic 17503 : AtomicExpansionKind::None; 17504 } 17505 17506 TargetLowering::AtomicExpansionKind 17507 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 17508 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS 17509 ? AtomicExpansionKind::NotAtomic 17510 : AtomicExpansionKind::None; 17511 } 17512 17513 TargetLowering::AtomicExpansionKind 17514 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { 17515 unsigned AddrSpace = CmpX->getPointerAddressSpace(); 17516 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) 17517 return AtomicExpansionKind::NotAtomic; 17518 17519 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX)) 17520 return AtomicExpansionKind::None; 17521 17522 const DataLayout &DL = CmpX->getDataLayout(); 17523 17524 Type *ValTy = CmpX->getNewValOperand()->getType(); 17525 17526 // If a 64-bit flat atomic may alias private, we need to avoid using the 17527 // atomic in the private case. 17528 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand 17529 : AtomicExpansionKind::None; 17530 } 17531 17532 const TargetRegisterClass * 17533 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 17534 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false); 17535 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 17536 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) 17537 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass 17538 : &AMDGPU::SReg_32RegClass; 17539 if (!TRI->isSGPRClass(RC) && !isDivergent) 17540 return TRI->getEquivalentSGPRClass(RC); 17541 if (TRI->isSGPRClass(RC) && isDivergent) 17542 return TRI->getEquivalentVGPRClass(RC); 17543 17544 return RC; 17545 } 17546 17547 // FIXME: This is a workaround for DivergenceAnalysis not understanding always 17548 // uniform values (as produced by the mask results of control flow intrinsics) 17549 // used outside of divergent blocks. The phi users need to also be treated as 17550 // always uniform. 17551 // 17552 // FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis? 17553 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, 17554 unsigned WaveSize) { 17555 // FIXME: We assume we never cast the mask results of a control flow 17556 // intrinsic. 17557 // Early exit if the type won't be consistent as a compile time hack. 
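// Only integer values whose width matches the wave size (i32 on wave32,
// i64 on wave64) can be the lane-mask values these intrinsics produce and
// consume, so anything else is rejected up front.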
17558 IntegerType *IT = dyn_cast<IntegerType>(V->getType()); 17559 if (!IT || IT->getBitWidth() != WaveSize) 17560 return false; 17561 17562 if (!isa<Instruction>(V)) 17563 return false; 17564 if (!Visited.insert(V).second) 17565 return false; 17566 bool Result = false; 17567 for (const auto *U : V->users()) { 17568 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) { 17569 if (V == U->getOperand(1)) { 17570 switch (Intrinsic->getIntrinsicID()) { 17571 default: 17572 Result = false; 17573 break; 17574 case Intrinsic::amdgcn_if_break: 17575 case Intrinsic::amdgcn_if: 17576 case Intrinsic::amdgcn_else: 17577 Result = true; 17578 break; 17579 } 17580 } 17581 if (V == U->getOperand(0)) { 17582 switch (Intrinsic->getIntrinsicID()) { 17583 default: 17584 Result = false; 17585 break; 17586 case Intrinsic::amdgcn_end_cf: 17587 case Intrinsic::amdgcn_loop: 17588 Result = true; 17589 break; 17590 } 17591 } 17592 } else { 17593 Result = hasCFUser(U, Visited, WaveSize); 17594 } 17595 if (Result) 17596 break; 17597 } 17598 return Result; 17599 } 17600 17601 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, 17602 const Value *V) const { 17603 if (const CallInst *CI = dyn_cast<CallInst>(V)) { 17604 if (CI->isInlineAsm()) { 17605 // FIXME: This cannot give a correct answer. This should only trigger in 17606 // the case where inline asm returns mixed SGPR and VGPR results, used 17607 // outside the defining block. We don't have a specific result to 17608 // consider, so this assumes if any value is SGPR, the overall register 17609 // also needs to be SGPR. 17610 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); 17611 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( 17612 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI); 17613 for (auto &TC : TargetConstraints) { 17614 if (TC.Type == InlineAsm::isOutput) { 17615 ComputeConstraintToUse(TC, SDValue()); 17616 const TargetRegisterClass *RC = 17617 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode, 17618 TC.ConstraintVT) 17619 .second; 17620 if (RC && SIRI->isSGPRClass(RC)) 17621 return true; 17622 } 17623 } 17624 } 17625 } 17626 SmallPtrSet<const Value *, 16> Visited; 17627 return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); 17628 } 17629 17630 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { 17631 for (SDUse &Use : N->uses()) { 17632 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) { 17633 if (getBasePtrIndex(M) == Use.getOperandNo()) 17634 return true; 17635 } 17636 } 17637 return false; 17638 } 17639 17640 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, 17641 SDValue N1) const { 17642 if (!N0.hasOneUse()) 17643 return false; 17644 // Take care of the opportunity to keep N0 uniform 17645 if (N0->isDivergent() || !N1->isDivergent()) 17646 return true; 17647 // Check if we have a good chance to form the memory access pattern with the 17648 // base and offset 17649 return (DAG.isBaseWithConstantOffset(N0) && 17650 hasMemSDNodeUser(*N0->user_begin())); 17651 } 17652 17653 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, 17654 Register N0, Register N1) const { 17655 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks 17656 } 17657 17658 MachineMemOperand::Flags 17659 SITargetLowering::getTargetMMOFlags(const Instruction &I) const { 17660 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. 
17661 MachineMemOperand::Flags Flags = MachineMemOperand::MONone; 17662 if (I.getMetadata("amdgpu.noclobber")) 17663 Flags |= MONoClobber; 17664 if (I.getMetadata("amdgpu.last.use")) 17665 Flags |= MOLastUse; 17666 return Flags; 17667 } 17668 17669 bool SITargetLowering::checkForPhysRegDependency( 17670 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, 17671 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const { 17672 if (User->getOpcode() != ISD::CopyToReg) 17673 return false; 17674 if (!Def->isMachineOpcode()) 17675 return false; 17676 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def); 17677 if (!MDef) 17678 return false; 17679 17680 unsigned ResNo = User->getOperand(Op).getResNo(); 17681 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1) 17682 return false; 17683 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode()); 17684 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) { 17685 PhysReg = AMDGPU::SCC; 17686 const TargetRegisterClass *RC = 17687 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo)); 17688 Cost = RC->getCopyCost(); 17689 return true; 17690 } 17691 return false; 17692 } 17693 17694 void SITargetLowering::emitExpandAtomicAddrSpacePredicate( 17695 Instruction *AI) const { 17696 // Given: atomicrmw fadd ptr %addr, float %val ordering 17697 // 17698 // With this expansion we produce the following code: 17699 // [...] 17700 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr) 17701 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private 17702 // 17703 // atomicrmw.shared: 17704 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3) 17705 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared, 17706 // float %val ordering 17707 // br label %atomicrmw.phi 17708 // 17709 // atomicrmw.check.private: 17710 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr) 17711 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global 17712 // 17713 // atomicrmw.private: 17714 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5) 17715 // %loaded.private = load float, ptr addrspace(5) %cast.private 17716 // %val.new = fadd float %loaded.private, %val 17717 // store float %val.new, ptr addrspace(5) %cast.private 17718 // br label %atomicrmw.phi 17719 // 17720 // atomicrmw.global: 17721 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1) 17722 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global, 17723 // float %val ordering 17724 // br label %atomicrmw.phi 17725 // 17726 // atomicrmw.phi: 17727 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ], 17728 // [ %loaded.private, %atomicrmw.private ], 17729 // [ %loaded.global, %atomicrmw.global ] 17730 // br label %atomicrmw.end 17731 // 17732 // atomicrmw.end: 17733 // [...] 17734 // 17735 // 17736 // For 64-bit atomics which may reside in private memory, we perform a simpler 17737 // version that only inserts the private check, and uses the flat operation. 17738 17739 IRBuilder<> Builder(AI); 17740 LLVMContext &Ctx = Builder.getContext(); 17741 17742 auto *RMW = dyn_cast<AtomicRMWInst>(AI); 17743 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex() 17744 : AtomicCmpXchgInst::getPointerOperandIndex(); 17745 Value *Addr = AI->getOperand(PtrOpIdx); 17746 17747 /// TODO: Only need to check private, then emit flat-known-not private (no 17748 /// need for shared block, or cast to global). 
17749 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI); 17750 17751 Align Alignment; 17752 if (RMW) 17753 Alignment = RMW->getAlign(); 17754 else if (CX) 17755 Alignment = CX->getAlign(); 17756 else 17757 llvm_unreachable("unhandled atomic operation"); 17758 17759 // FullFlatEmulation is true if we need to issue the private, shared, and 17760 // global cases. 17761 // 17762 // If this is false, we are only dealing with the flat-targeting-private case, 17763 // where we only insert a check for private and still use the flat instruction 17764 // for global and shared. 17765 17766 bool FullFlatEmulation = 17767 RMW && RMW->getOperation() == AtomicRMWInst::FAdd && 17768 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) || 17769 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && 17770 RMW->getType()->isDoubleTy())); 17771 17772 // If the return value isn't used, do not introduce a false use in the phi. 17773 bool ReturnValueIsUsed = !AI->use_empty(); 17774 17775 BasicBlock *BB = Builder.GetInsertBlock(); 17776 Function *F = BB->getParent(); 17777 BasicBlock *ExitBB = 17778 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); 17779 BasicBlock *SharedBB = nullptr; 17780 17781 BasicBlock *CheckPrivateBB = BB; 17782 if (FullFlatEmulation) { 17783 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB); 17784 CheckPrivateBB = 17785 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB); 17786 } 17787 17788 BasicBlock *PrivateBB = 17789 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB); 17790 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB); 17791 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB); 17792 17793 std::prev(BB->end())->eraseFromParent(); 17794 Builder.SetInsertPoint(BB); 17795 17796 Value *LoadedShared = nullptr; 17797 if (FullFlatEmulation) { 17798 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, 17799 {Addr}, nullptr, "is.shared"); 17800 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); 17801 Builder.SetInsertPoint(SharedBB); 17802 Value *CastToLocal = Builder.CreateAddrSpaceCast( 17803 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS)); 17804 17805 Instruction *Clone = AI->clone(); 17806 Clone->insertInto(SharedBB, SharedBB->end()); 17807 Clone->getOperandUse(PtrOpIdx).set(CastToLocal); 17808 LoadedShared = Clone; 17809 17810 Builder.CreateBr(PhiBB); 17811 Builder.SetInsertPoint(CheckPrivateBB); 17812 } 17813 17814 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private, 17815 {Addr}, nullptr, "is.private"); 17816 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB); 17817 17818 Builder.SetInsertPoint(PrivateBB); 17819 17820 Value *CastToPrivate = Builder.CreateAddrSpaceCast( 17821 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS)); 17822 17823 Value *LoadedPrivate; 17824 if (RMW) { 17825 LoadedPrivate = Builder.CreateAlignedLoad( 17826 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private"); 17827 17828 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder, 17829 LoadedPrivate, RMW->getValOperand()); 17830 17831 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign()); 17832 } else { 17833 auto [ResultLoad, Equal] = 17834 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(), 17835 CX->getNewValOperand(), CX->getAlign()); 17836 17837 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()), 17838 ResultLoad, 0); 17839 LoadedPrivate = 
Builder.CreateInsertValue(Insert, Equal, 1);
17840 }
17841
17842 Builder.CreateBr(PhiBB);
17843
17844 Builder.SetInsertPoint(GlobalBB);
17845
17846 // Continue using a flat instruction if we only emitted the check for private.
17847 Instruction *LoadedGlobal = AI;
17848 if (FullFlatEmulation) {
17849 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17850 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17851 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17852 }
17853
17854 AI->removeFromParent();
17855 AI->insertInto(GlobalBB, GlobalBB->end());
17856
17857 // The new atomicrmw may go through another round of legalization later.
17858 if (!FullFlatEmulation) {
17859 // We inserted the runtime check already, make sure we do not try to
17860 // re-expand this.
17861 // TODO: Should union with any existing metadata.
17862 MDBuilder MDB(F->getContext());
17863 MDNode *RangeNotPrivate =
17864 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17865 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17866 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17867 RangeNotPrivate);
17868 }
17869
17870 Builder.CreateBr(PhiBB);
17871
17872 Builder.SetInsertPoint(PhiBB);
17873
17874 if (ReturnValueIsUsed) {
17875 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17876 AI->replaceAllUsesWith(Loaded);
17877 if (FullFlatEmulation)
17878 Loaded->addIncoming(LoadedShared, SharedBB);
17879 Loaded->addIncoming(LoadedPrivate, PrivateBB);
17880 Loaded->addIncoming(LoadedGlobal, GlobalBB);
17881 Loaded->takeName(AI);
17882 }
17883
17884 Builder.CreateBr(ExitBB);
17885 }
17886
17887 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17888 AtomicRMWInst::BinOp Op = AI->getOperation();
17889
17890 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17891 Op == AtomicRMWInst::Xor) {
17892 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17893 ConstVal && ConstVal->isNullValue()) {
17894 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17895 AI->setOperation(AtomicRMWInst::Add);
17896
17897 // We may still need the private-alias-flat handling below.
17898
17899 // TODO: Skip this for cases where we cannot access remote memory.
17900 }
17901 }
17902
17903 // The non-flat expansions should only perform the de-canonicalization of
17904 // identity values.
17905 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17906 return;
17907
17908 emitExpandAtomicAddrSpacePredicate(AI);
17909 }
17910
17911 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17912 emitExpandAtomicAddrSpacePredicate(CI);
17913 }
17914
17915 LoadInst *
17916 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17917 IRBuilder<> Builder(AI);
17918 auto Order = AI->getOrdering();
17919
17920 // The optimization removes the store aspect of the atomicrmw. Therefore the
17921 // cache must still be flushed if the atomic ordering had release semantics.
17922 // That flush is not necessarily a fence; a release fence merely happens to
17923 // perform it. So avoid replacing an atomicrmw that has release semantics.
17924 if (isReleaseOrStronger(Order))
17925 return nullptr;
17926
17927 LoadInst *LI = Builder.CreateAlignedLoad(
17928 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17929 LI->setAtomic(Order, AI->getSyncScopeID());
17930 LI->copyMetadata(*AI);
17931 LI->takeName(AI);
17932 AI->replaceAllUsesWith(LI);
17933 AI->eraseFromParent();
17934 return LI;
17935 }
17936
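// For example (illustrative IR, not produced verbatim by the function above),
// an idempotent update such as
//     %old = atomicrmw or ptr %p, i32 0 acquire
// can be rewritten as
//     %old = load atomic i32, ptr %p acquire, align 4
//
// A rough standalone sketch of the ordering guard used in
// lowerIdempotentRMWIntoFencedLoad above (names are made up, not LLVM API):
//
//   enum class Ordering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };
//
//   // Release-or-stronger orderings keep a store-side obligation, so only the
//   // weaker orderings may be rewritten into a plain atomic load.
//   static bool canLowerIdempotentRMWToLoad(Ordering O) {
//     return O != Ordering::Release && O != Ordering::AcquireRelease &&
//            O != Ordering::SeqCst;
//   }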