//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
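
// Both assumptions can be overridden per compile; an illustrative invocation
// (not a required workflow) would be:
//   llc -mtriple=amdgcn-amd-amdhsa \
//       -amdgpu-assume-external-call-stack-size=8192 \
//       -amdgpu-assume-dynamic-stack-object-size=2048 input.ll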

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }
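
  // The post-order walk below generally visits callees before callers, so
  // when a caller is analyzed, the resource info of its known direct callees
  // is already in CallGraphResourceInfo. For example, if kernel K calls helper
  // H, H is analyzed first and K's analysis folds in H's counts. Callees that
  // are not yet known (recursive cycles, indirect calls) are handled
  // conservatively in analyzeResourceUsage.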
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions.
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
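  // For example, a function with a 64-byte static frame that also contains a
  // variable-sized alloca is reported as
  // 64 + AssumedStackSizeForDynamicSizeObjects bytes of private segment
  // (64 + 4096 with the default flag value).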
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
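
        // Classify the remaining registers below by register file and tuple
        // width (in units of 32-bit registers). For example, an operand in
        // VReg_128 such as v[4:7] has Width = 4, so VGPRs up to index 7 are
        // counted as used.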
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
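
      // Calls are handled below by folding in the callee's already-computed
      // resource info when it is known (e.g. a callee reporting 40 VGPRs and a
      // 256-byte private segment raises MaxVGPR to at least 39 and
      // CalleeFrameSize to at least 256), and by falling back to conservative
      // assumptions for indirect or external callees.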
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it.
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later.
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

// Illustration of the propagation below: if the non-entry functions in the
// module use at most 84 SGPRs and 64 VGPRs (hypothetical counts), then every
// function that makes an indirect call is reported as using at least 84 SGPRs
// and 64 VGPRs, since the call could reach any of those functions.
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}