//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

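  // Derive flat scratch, private segment (stack) and VCC usage first;
  // register counts are filled in afterwards, either directly from
  // MachineRegisterInfo when the function makes no calls, or by scanning
  // every instruction below.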
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
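      // Track the highest register index touched in each register file
      // (SGPR, VGPR, AGPR); an operand's width is added so that wide
      // register tuples account for all of their component registers.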
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

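        // Functions are visited in post order over the call graph, so a
        // directly called, defined callee has normally been analyzed already
        // and its info can be looked up in CallGraphResourceInfo; unknown,
        // external and indirect callees fall back to the conservative
        // assumptions below.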
        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}