1 //===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// \brief Analyzes how many registers and other resources are used by 11 /// functions. 12 /// 13 /// The results of this analysis are used to fill the register usage, flat 14 /// usage, etc. into hardware registers. 15 /// 16 /// The analysis takes callees into account. E.g. if a function A that needs 10 17 /// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A 18 /// will return 20. 19 /// It is assumed that an indirect call can go into any function except 20 /// hardware-entrypoints. Therefore the register usage of functions with 21 /// indirect calls is estimated as the maximum of all non-entrypoint functions 22 /// in the module. 23 /// 24 //===----------------------------------------------------------------------===// 25 26 #include "AMDGPUResourceUsageAnalysis.h" 27 #include "AMDGPU.h" 28 #include "GCNSubtarget.h" 29 #include "SIMachineFunctionInfo.h" 30 #include "llvm/ADT/PostOrderIterator.h" 31 #include "llvm/Analysis/CallGraph.h" 32 #include "llvm/CodeGen/MachineFrameInfo.h" 33 #include "llvm/CodeGen/TargetPassConfig.h" 34 #include "llvm/IR/GlobalAlias.h" 35 #include "llvm/IR/GlobalValue.h" 36 #include "llvm/Target/TargetMachine.h" 37 38 using namespace llvm; 39 using namespace llvm::AMDGPU; 40 41 #define DEBUG_TYPE "amdgpu-resource-usage" 42 43 char llvm::AMDGPUResourceUsageAnalysis::ID = 0; 44 char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID; 45 46 // In code object v4 and older, we need to tell the runtime some amount ahead of 47 // time if we don't know the true stack size. Assume a smaller number if this is 48 // only due to dynamic / non-entry block allocas. 49 static cl::opt<uint32_t> AssumedStackSizeForExternalCall( 50 "amdgpu-assume-external-call-stack-size", 51 cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden, 52 cl::init(16384)); 53 54 static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects( 55 "amdgpu-assume-dynamic-stack-object-size", 56 cl::desc("Assumed extra stack use if there are any " 57 "variable sized objects (in bytes)"), 58 cl::Hidden, cl::init(4096)); 59 60 INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE, 61 "Function register usage analysis", true, true) 62 63 static const Function *getCalleeFunction(const MachineOperand &Op) { 64 if (Op.isImm()) { 65 assert(Op.getImm() == 0); 66 return nullptr; 67 } 68 if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal())) 69 return cast<Function>(GA->getOperand(0)); 70 return cast<Function>(Op.getGlobal()); 71 } 72 73 static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, 74 const SIInstrInfo &TII, unsigned Reg) { 75 for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { 76 if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())) 77 return true; 78 } 79 80 return false; 81 } 82 83 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs( 84 const GCNSubtarget &ST) const { 85 return NumExplicitSGPR + 86 IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch, 87 ST.getTargetID().isXnackOnOrAny()); 88 } 89 90 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( 91 const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const { 92 return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR); 93 } 94 95 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( 96 const GCNSubtarget &ST) const { 97 return getTotalNumVGPRs(ST, NumAGPR, NumVGPR); 98 } 99 100 bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { 101 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 102 if (!TPC) 103 return false; 104 105 MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); 106 const TargetMachine &TM = TPC->getTM<TargetMachine>(); 107 const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo(); 108 bool HasIndirectCall = false; 109 110 CallGraph CG = CallGraph(M); 111 auto End = po_end(&CG); 112 113 // By default, for code object v5 and later, track only the minimum scratch 114 // size 115 if (AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 || 116 STI.getTargetTriple().getOS() == Triple::AMDPAL) { 117 if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences()) 118 AssumedStackSizeForDynamicSizeObjects = 0; 119 if (!AssumedStackSizeForExternalCall.getNumOccurrences()) 120 AssumedStackSizeForExternalCall = 0; 121 } 122 123 for (auto IT = po_begin(&CG); IT != End; ++IT) { 124 Function *F = IT->getFunction(); 125 if (!F || F->isDeclaration()) 126 continue; 127 128 MachineFunction *MF = MMI.getMachineFunction(*F); 129 assert(MF && "function must have been generated already"); 130 131 auto CI = 132 CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo())); 133 SIFunctionResourceInfo &Info = CI.first->second; 134 assert(CI.second && "should only be called once per function"); 135 Info = analyzeResourceUsage(*MF, TM); 136 HasIndirectCall |= Info.HasIndirectCall; 137 } 138 139 // It's possible we have unreachable functions in the module which weren't 140 // visited by the PO traversal. Make sure we have some resource counts to 141 // report. 142 for (const auto &IT : CG) { 143 const Function *F = IT.first; 144 if (!F || F->isDeclaration()) 145 continue; 146 147 auto CI = 148 CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo())); 149 if (!CI.second) // Skip already visited functions 150 continue; 151 152 SIFunctionResourceInfo &Info = CI.first->second; 153 MachineFunction *MF = MMI.getMachineFunction(*F); 154 assert(MF && "function must have been generated already"); 155 Info = analyzeResourceUsage(*MF, TM); 156 HasIndirectCall |= Info.HasIndirectCall; 157 } 158 159 if (HasIndirectCall) 160 propagateIndirectCallRegisterUsage(); 161 162 return false; 163 } 164 165 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo 166 AMDGPUResourceUsageAnalysis::analyzeResourceUsage( 167 const MachineFunction &MF, const TargetMachine &TM) const { 168 SIFunctionResourceInfo Info; 169 170 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 171 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 172 const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); 173 const MachineRegisterInfo &MRI = MF.getRegInfo(); 174 const SIInstrInfo *TII = ST.getInstrInfo(); 175 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 176 177 Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || 178 MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) || 179 MRI.isLiveIn(MFI->getPreloadedReg( 180 AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)); 181 182 // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat 183 // instructions aren't used to access the scratch buffer. Inline assembly may 184 // need it though. 185 // 186 // If we only have implicit uses of flat_scr on flat instructions, it is not 187 // really needed. 188 if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() && 189 (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && 190 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && 191 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { 192 Info.UsesFlatScratch = false; 193 } 194 195 Info.PrivateSegmentSize = FrameInfo.getStackSize(); 196 197 // Assume a big number if there are any unknown sized objects. 198 Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); 199 if (Info.HasDynamicallySizedStack) 200 Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects; 201 202 if (MFI->isStackRealigned()) 203 Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value(); 204 205 Info.UsesVCC = 206 MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); 207 208 // If there are no calls, MachineRegisterInfo can tell us the used register 209 // count easily. 210 // A tail call isn't considered a call for MachineFrameInfo's purposes. 211 if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { 212 MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; 213 for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { 214 if (MRI.isPhysRegUsed(Reg)) { 215 HighestVGPRReg = Reg; 216 break; 217 } 218 } 219 220 if (ST.hasMAIInsts()) { 221 MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; 222 for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { 223 if (MRI.isPhysRegUsed(Reg)) { 224 HighestAGPRReg = Reg; 225 break; 226 } 227 } 228 Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister 229 ? 0 230 : TRI.getHWRegIndex(HighestAGPRReg) + 1; 231 } 232 233 MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; 234 for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { 235 if (MRI.isPhysRegUsed(Reg)) { 236 HighestSGPRReg = Reg; 237 break; 238 } 239 } 240 241 // We found the maximum register index. They start at 0, so add one to get 242 // the number of registers. 243 Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister 244 ? 0 245 : TRI.getHWRegIndex(HighestVGPRReg) + 1; 246 Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister 247 ? 0 248 : TRI.getHWRegIndex(HighestSGPRReg) + 1; 249 250 return Info; 251 } 252 253 int32_t MaxVGPR = -1; 254 int32_t MaxAGPR = -1; 255 int32_t MaxSGPR = -1; 256 uint64_t CalleeFrameSize = 0; 257 258 for (const MachineBasicBlock &MBB : MF) { 259 for (const MachineInstr &MI : MBB) { 260 // TODO: Check regmasks? Do they occur anywhere except calls? 261 for (const MachineOperand &MO : MI.operands()) { 262 unsigned Width = 0; 263 bool IsSGPR = false; 264 bool IsAGPR = false; 265 266 if (!MO.isReg()) 267 continue; 268 269 Register Reg = MO.getReg(); 270 switch (Reg) { 271 case AMDGPU::EXEC: 272 case AMDGPU::EXEC_LO: 273 case AMDGPU::EXEC_HI: 274 case AMDGPU::SCC: 275 case AMDGPU::M0: 276 case AMDGPU::M0_LO16: 277 case AMDGPU::M0_HI16: 278 case AMDGPU::SRC_SHARED_BASE_LO: 279 case AMDGPU::SRC_SHARED_BASE: 280 case AMDGPU::SRC_SHARED_LIMIT_LO: 281 case AMDGPU::SRC_SHARED_LIMIT: 282 case AMDGPU::SRC_PRIVATE_BASE_LO: 283 case AMDGPU::SRC_PRIVATE_BASE: 284 case AMDGPU::SRC_PRIVATE_LIMIT_LO: 285 case AMDGPU::SRC_PRIVATE_LIMIT: 286 case AMDGPU::SGPR_NULL: 287 case AMDGPU::SGPR_NULL64: 288 case AMDGPU::MODE: 289 continue; 290 291 case AMDGPU::SRC_POPS_EXITING_WAVE_ID: 292 llvm_unreachable("src_pops_exiting_wave_id should not be used"); 293 294 case AMDGPU::NoRegister: 295 assert(MI.isDebugInstr() && 296 "Instruction uses invalid noreg register"); 297 continue; 298 299 case AMDGPU::VCC: 300 case AMDGPU::VCC_LO: 301 case AMDGPU::VCC_HI: 302 case AMDGPU::VCC_LO_LO16: 303 case AMDGPU::VCC_LO_HI16: 304 case AMDGPU::VCC_HI_LO16: 305 case AMDGPU::VCC_HI_HI16: 306 Info.UsesVCC = true; 307 continue; 308 309 case AMDGPU::FLAT_SCR: 310 case AMDGPU::FLAT_SCR_LO: 311 case AMDGPU::FLAT_SCR_HI: 312 continue; 313 314 case AMDGPU::XNACK_MASK: 315 case AMDGPU::XNACK_MASK_LO: 316 case AMDGPU::XNACK_MASK_HI: 317 llvm_unreachable("xnack_mask registers should not be used"); 318 319 case AMDGPU::LDS_DIRECT: 320 llvm_unreachable("lds_direct register should not be used"); 321 322 case AMDGPU::TBA: 323 case AMDGPU::TBA_LO: 324 case AMDGPU::TBA_HI: 325 case AMDGPU::TMA: 326 case AMDGPU::TMA_LO: 327 case AMDGPU::TMA_HI: 328 llvm_unreachable("trap handler registers should not be used"); 329 330 case AMDGPU::SRC_VCCZ: 331 llvm_unreachable("src_vccz register should not be used"); 332 333 case AMDGPU::SRC_EXECZ: 334 llvm_unreachable("src_execz register should not be used"); 335 336 case AMDGPU::SRC_SCC: 337 llvm_unreachable("src_scc register should not be used"); 338 339 default: 340 break; 341 } 342 343 if (AMDGPU::SGPR_32RegClass.contains(Reg) || 344 AMDGPU::SGPR_LO16RegClass.contains(Reg) || 345 AMDGPU::SGPR_HI16RegClass.contains(Reg)) { 346 IsSGPR = true; 347 Width = 1; 348 } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || 349 AMDGPU::VGPR_LO16RegClass.contains(Reg) || 350 AMDGPU::VGPR_HI16RegClass.contains(Reg)) { 351 IsSGPR = false; 352 Width = 1; 353 } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || 354 AMDGPU::AGPR_LO16RegClass.contains(Reg)) { 355 IsSGPR = false; 356 IsAGPR = true; 357 Width = 1; 358 } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { 359 IsSGPR = true; 360 Width = 2; 361 } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { 362 IsSGPR = false; 363 Width = 2; 364 } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { 365 IsSGPR = false; 366 IsAGPR = true; 367 Width = 2; 368 } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { 369 IsSGPR = false; 370 Width = 3; 371 } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { 372 IsSGPR = true; 373 Width = 3; 374 } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { 375 IsSGPR = false; 376 IsAGPR = true; 377 Width = 3; 378 } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) { 379 IsSGPR = true; 380 Width = 4; 381 } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { 382 IsSGPR = false; 383 Width = 4; 384 } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { 385 IsSGPR = false; 386 IsAGPR = true; 387 Width = 4; 388 } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { 389 IsSGPR = false; 390 Width = 5; 391 } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { 392 IsSGPR = true; 393 Width = 5; 394 } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { 395 IsSGPR = false; 396 IsAGPR = true; 397 Width = 5; 398 } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { 399 IsSGPR = false; 400 Width = 6; 401 } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { 402 IsSGPR = true; 403 Width = 6; 404 } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { 405 IsSGPR = false; 406 IsAGPR = true; 407 Width = 6; 408 } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { 409 IsSGPR = false; 410 Width = 7; 411 } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { 412 IsSGPR = true; 413 Width = 7; 414 } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { 415 IsSGPR = false; 416 IsAGPR = true; 417 Width = 7; 418 } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { 419 IsSGPR = true; 420 Width = 8; 421 } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { 422 IsSGPR = false; 423 Width = 8; 424 } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { 425 IsSGPR = false; 426 IsAGPR = true; 427 Width = 8; 428 } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { 429 IsSGPR = false; 430 Width = 9; 431 } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { 432 IsSGPR = true; 433 Width = 9; 434 } else if (AMDGPU::AReg_288RegClass.contains(Reg)) { 435 IsSGPR = false; 436 IsAGPR = true; 437 Width = 9; 438 } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { 439 IsSGPR = false; 440 Width = 10; 441 } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { 442 IsSGPR = true; 443 Width = 10; 444 } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { 445 IsSGPR = false; 446 IsAGPR = true; 447 Width = 10; 448 } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { 449 IsSGPR = false; 450 Width = 11; 451 } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { 452 IsSGPR = true; 453 Width = 11; 454 } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { 455 IsSGPR = false; 456 IsAGPR = true; 457 Width = 11; 458 } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { 459 IsSGPR = false; 460 Width = 12; 461 } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { 462 IsSGPR = true; 463 Width = 12; 464 } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { 465 IsSGPR = false; 466 IsAGPR = true; 467 Width = 12; 468 } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { 469 IsSGPR = true; 470 Width = 16; 471 } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { 472 IsSGPR = false; 473 Width = 16; 474 } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { 475 IsSGPR = false; 476 IsAGPR = true; 477 Width = 16; 478 } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { 479 IsSGPR = true; 480 Width = 32; 481 } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { 482 IsSGPR = false; 483 Width = 32; 484 } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { 485 IsSGPR = false; 486 IsAGPR = true; 487 Width = 32; 488 } else { 489 // We only expect TTMP registers or registers that do not belong to 490 // any RC. 491 assert((AMDGPU::TTMP_32RegClass.contains(Reg) || 492 AMDGPU::TTMP_64RegClass.contains(Reg) || 493 AMDGPU::TTMP_128RegClass.contains(Reg) || 494 AMDGPU::TTMP_256RegClass.contains(Reg) || 495 AMDGPU::TTMP_512RegClass.contains(Reg) || 496 !TRI.getPhysRegBaseClass(Reg)) && 497 "Unknown register class"); 498 } 499 unsigned HWReg = TRI.getHWRegIndex(Reg); 500 int MaxUsed = HWReg + Width - 1; 501 if (IsSGPR) { 502 MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; 503 } else if (IsAGPR) { 504 MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; 505 } else { 506 MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; 507 } 508 } 509 510 if (MI.isCall()) { 511 // Pseudo used just to encode the underlying global. Is there a better 512 // way to track this? 513 514 const MachineOperand *CalleeOp = 515 TII->getNamedOperand(MI, AMDGPU::OpName::callee); 516 517 const Function *Callee = getCalleeFunction(*CalleeOp); 518 DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I = 519 CallGraphResourceInfo.end(); 520 521 // Avoid crashing on undefined behavior with an illegal call to a 522 // kernel. If a callsite's calling convention doesn't match the 523 // function's, it's undefined behavior. If the callsite calling 524 // convention does match, that would have errored earlier. 525 if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) 526 report_fatal_error("invalid call to entry function"); 527 528 bool IsIndirect = !Callee || Callee->isDeclaration(); 529 if (!IsIndirect) 530 I = CallGraphResourceInfo.find(Callee); 531 532 // FIXME: Call site could have norecurse on it 533 if (!Callee || !Callee->doesNotRecurse()) { 534 Info.HasRecursion = true; 535 536 // TODO: If we happen to know there is no stack usage in the 537 // callgraph, we don't need to assume an infinitely growing stack. 538 if (!MI.isReturn()) { 539 // We don't need to assume an unknown stack size for tail calls. 540 541 // FIXME: This only benefits in the case where the kernel does not 542 // directly call the tail called function. If a kernel directly 543 // calls a tail recursive function, we'll assume maximum stack size 544 // based on the regular call instruction. 545 CalleeFrameSize = 546 std::max(CalleeFrameSize, 547 static_cast<uint64_t>(AssumedStackSizeForExternalCall)); 548 } 549 } 550 551 if (IsIndirect || I == CallGraphResourceInfo.end()) { 552 CalleeFrameSize = 553 std::max(CalleeFrameSize, 554 static_cast<uint64_t>(AssumedStackSizeForExternalCall)); 555 556 // Register usage of indirect calls gets handled later 557 Info.UsesVCC = true; 558 Info.UsesFlatScratch = ST.hasFlatAddressSpace(); 559 Info.HasDynamicallySizedStack = true; 560 Info.HasIndirectCall = true; 561 } else { 562 // We force CodeGen to run in SCC order, so the callee's register 563 // usage etc. should be the cumulative usage of all callees. 564 MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); 565 MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); 566 MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); 567 CalleeFrameSize = 568 std::max(I->second.PrivateSegmentSize, CalleeFrameSize); 569 Info.UsesVCC |= I->second.UsesVCC; 570 Info.UsesFlatScratch |= I->second.UsesFlatScratch; 571 Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; 572 Info.HasRecursion |= I->second.HasRecursion; 573 Info.HasIndirectCall |= I->second.HasIndirectCall; 574 } 575 } 576 } 577 } 578 579 Info.NumExplicitSGPR = MaxSGPR + 1; 580 Info.NumVGPR = MaxVGPR + 1; 581 Info.NumAGPR = MaxAGPR + 1; 582 Info.PrivateSegmentSize += CalleeFrameSize; 583 584 return Info; 585 } 586 587 void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() { 588 // Collect the maximum number of registers from non-hardware-entrypoints. 589 // All these functions are potential targets for indirect calls. 590 int32_t NonKernelMaxSGPRs = 0; 591 int32_t NonKernelMaxVGPRs = 0; 592 int32_t NonKernelMaxAGPRs = 0; 593 594 for (const auto &I : CallGraphResourceInfo) { 595 if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) { 596 auto &Info = I.getSecond(); 597 NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR); 598 NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR); 599 NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR); 600 } 601 } 602 603 // Add register usage for functions with indirect calls. 604 // For calls to unknown functions, we assume the maximum register usage of 605 // all non-hardware-entrypoints in the current module. 606 for (auto &I : CallGraphResourceInfo) { 607 auto &Info = I.getSecond(); 608 if (Info.HasIndirectCall) { 609 Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs); 610 Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs); 611 Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs); 612 } 613 } 614 } 615