//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
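///
/// For example (illustrative numbers only): if the non-entrypoint functions
/// of a module use at most 20, 40, and 64 VGPRs respectively, any function
/// containing an indirect call is assumed to use 64 VGPRs in addition to its
/// own direct usage.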
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  return cast<Function>(Op.getGlobal());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}
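// On targets with gfx90a-style MAI instructions, AGPRs are packed after the
// VGPRs (with the VGPR count rounded up to a multiple of 4), so the two
// counts add up. On other targets the two files are separate and the larger
// of the two counts is the total.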
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  if (ST.hasGFX90AInsts() && NumAGPR)
    return alignTo(NumVGPR, 4) + NumAGPR;
  return std::max(NumVGPR, NumAGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
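      // Track the highest hardware register index touched in each register
      // file (SGPR, VGPR, AGPR). A register tuple counts as Width consecutive
      // registers starting at its base index.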
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
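
      // Direct calls to functions analyzed earlier in the SCC walk fold the
      // callee's resource usage into this function's. Calls to unknown or
      // external callees fall back to the assumed worst-case stack size, and
      // their register usage is filled in later by
      // propagateIndirectCallRegisterUsage().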
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }
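
  // Convert the highest register indices seen back into register counts and
  // add the largest callee stack frame on top of this function's own frame.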
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}