xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp (revision fe6060f10f634930ff71b7c50291ddc610da2475)
//===- AMDGPUResourceUsageAnalysis.cpp - analysis of resources -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// scratch usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum over all non-entrypoint
/// functions in the module.
///
//===----------------------------------------------------------------------===//
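
// Illustrative example of the callee rule above: if @a uses v0..v9 itself and
// calls @b, which uses v0..v19, the analysis reports NumVGPR = 20 for both @a
// and @b, i.e. a caller's usage is the maximum over itself and its callees.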

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
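
// Both knobs are ordinary cl::opts, so they can be overridden when invoking
// llc, e.g. (input file name illustrative):
//   llc -mtriple=amdgcn-amd-amdhsa \
//       -amdgpu-assume-external-call-stack-size=8192 \
//       -amdgpu-assume-dynamic-stack-object-size=1024 kernel.ll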

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

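// Extract the callee from a call pseudo's callee operand. An immediate
// operand (always 0 here) stands for an unknown callee; otherwise the operand
// is the callee global itself.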
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  return cast<Function>(Op.getGlobal());
}

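// Return true if Reg has any use other than as an implicit operand of a FLAT
// instruction; only such uses make flat scratch initialization truly
// necessary.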
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

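// The total SGPR count is the explicit use plus the extra SGPRs the target
// reserves for VCC, flat scratch, and XNACK when those features are active.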
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

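// On gfx90a, AGPRs share a unified register file with VGPRs and are allocated
// after them at a 4-register-aligned offset, so e.g. NumVGPR = 5 and
// NumAGPR = 3 gives alignTo(5, 4) + 3 = 11 total. On other subtargets the two
// files are separate and the larger count is what matters.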
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  if (ST.hasGFX90AInsts() && NumAGPR)
    return alignTo(NumVGPR, 4) + NumAGPR;
  return std::max(NumVGPR, NumAGPR);
}

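// Runs once per strongly connected component, visiting callees before
// callers, so by the time a function is analyzed the resource info of its
// (non-indirect) callees is already cached in CallGraphResourceInfo.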
bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

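// Compute the resource usage of a single machine function: register
// high-water marks, private segment (scratch) size, and flags such as VCC and
// flat scratch use. The recorded usage of known callees is folded in; unknown
// callees fall back to the assumed-stack-size thresholds above.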
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
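    // E.g. if the highest used VGPR is v7, getHWRegIndex returns 7 and
    // NumVGPR becomes 8.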
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
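        // Record the highest hardware register index this operand touches,
        // e.g. a use of s[4:7] gives HWReg = 4, Width = 4, MaxUsed = 7.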
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later.
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
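  // E.g. if the largest non-entry function in the module needs 40 VGPRs, any
  // function containing an indirect call is assumed to need at least 40 VGPRs
  // as well.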
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}