//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
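// Illustrative example (hypothetical numbers): with the default values of the
// two options below, a function with a known 1024-byte frame that makes one
// external call and also contains a variable-sized alloca is reported as
// using 1024 + 4096 + 16384 = 21504 bytes of private segment.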
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  if (ST.hasGFX90AInsts() && ArgNumAGPR)
    return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
  return std::max(ArgNumVGPR, ArgNumAGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

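// Note: this is a CallGraphSCC pass, so (absent recursion) callees are
// visited before their callers. By the time a caller is analyzed, the
// cumulative resource usage of each known callee is already recorded in
// CallGraphResourceInfo.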
bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly
  // may need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
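      // Scan every register operand, classify it as SGPR / VGPR / AGPR, and
      // track the highest hardware register index touched. A register tuple
      // of Width N whose first 32-bit lane has HW index I covers indices
      // [I, I + N - 1]; e.g. a use of the 128-bit tuple v[4:7] raises
      // MaxVGPR to at least 7.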
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

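      // For calls, fold the callee's resource usage into this function's.
      // Known callees (already analyzed, thanks to SCC order) contribute
      // their exact numbers; unknown or indirect callees force conservative
      // assumptions: VCC, flat scratch, a dynamically sized stack, and
      // AssumedStackSizeForExternalCall bytes of callee frame.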
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}