//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
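
// Both assumptions below are exposed as plain cl::opts, so they can be tuned
// from any driver that reaches this backend; e.g. (a hypothetical invocation):
//   llc -mtriple=amdgcn-amd-amdhsa -amdgpu-assume-external-call-stack-size=8192 in.ll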

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  return cast<Function>(Op.getGlobal());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}
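
// On gfx90a, VGPRs and AGPRs are allocated out of a single combined pool,
// with the AGPR block placed at a 4-register aligned offset after the VGPRs
// (hence the alignTo below); e.g. (a sketch) NumVGPR = 10 and NumAGPR = 2
// occupy alignTo(10, 4) + 2 = 14 registers. Earlier MAI targets allocate the
// two files separately, so only the larger of the two counts matters.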
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  if (ST.hasGFX90AInsts() && NumAGPR)
    return alignTo(NumVGPR, 4) + NumAGPR;
  return std::max(NumVGPR, NumAGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}
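
// Compute the resource usage of a single function. Since this is a CGSCC
// pass, SCCs are visited bottom-up, so by the time a caller reaches this
// point the CallGraphResourceInfo entries for all of its direct callees have
// already been computed.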
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }
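
  // With calls present we must scan every register operand of every
  // instruction instead. The trackers below record the highest hardware
  // register index seen in each file (-1 meaning "none used"), plus the
  // largest callee stack frame.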
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
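
        // Any register that survives the switch above is a real allocatable
        // SGPR, VGPR or AGPR (possibly a tuple). Work out which file it lives
        // in and how many consecutive 32-bit registers the tuple covers.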
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
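
      // For call instructions, additionally fold in the callee's resource
      // usage: either the precise numbers already recorded for a known
      // callee, or pessimistic module-level assumptions for an unknown one.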
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
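
// Widen the recorded usage of every function that makes an indirect call to
// the worst case over all functions such a call could plausibly reach.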
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
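  // E.g. (a sketch): if the maxima above came out as 40 SGPRs and 64 VGPRs,
  // a function with an indirect call that only measured 32 SGPRs and 24
  // VGPRs is raised to 40 and 64 respectively.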
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}