//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysisWrapperPass::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID =
    AMDGPUResourceUsageAnalysisWrapperPass::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead
// of time if we don't know the true stack size. Assume a smaller number if
// this is only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysisWrapperPass, DEBUG_TYPE,
                "Function register usage analysis", true, true)

// Return the Function the call pseudo's callee operand refers to, looking
// through pointer casts and aliases. An immediate operand (asserted to be 0)
// means there is no statically known callee, so nullptr is returned.
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

// Return true if Reg has any use other than as an implicit operand of a FLAT
// instruction.
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysisWrapperPass::runOnMachineFunction(
    MachineFunction &MF) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);

  return false;
}

AnalysisKey AMDGPUResourceUsageAnalysis::Key;

AMDGPUResourceUsageAnalysis::Result
AMDGPUResourceUsageAnalysis::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  return AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);
}
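// Compute the resource usage of a single function: how many SGPRs, VGPRs, and
// AGPRs it uses, whether it needs VCC and flat scratch, its private segment
// (stack) size, and conservative assumptions for any calls it makes.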
AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly
  // may need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }
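  // The function makes calls, so walk every instruction and record the
  // highest hardware register index touched in each register file, plus
  // per-callsite information. A running maximum of -1 means no register of
  // that kind is used.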
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
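        // Classify any remaining physical register: is it an SGPR, AGPR, or
        // VGPR, and how many consecutive 32-bit registers does the tuple
        // cover (Width)?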
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
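        // Record the highest hardware register slot this operand can touch.
        // For example, a use of a 64-bit VGPR pair starting at v5 has
        // hardware index 5 and Width 2, so it raises MaxVGPR to at least 6.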
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF,
                                 const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack
            // size based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}