xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab)
1fe6060f1SDimitry Andric //===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
2fe6060f1SDimitry Andric //
3fe6060f1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4fe6060f1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5fe6060f1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6fe6060f1SDimitry Andric //
7fe6060f1SDimitry Andric //===----------------------------------------------------------------------===//
8fe6060f1SDimitry Andric //
9fe6060f1SDimitry Andric /// \file
10fe6060f1SDimitry Andric /// \brief Analyzes how many registers and other resources are used by
11fe6060f1SDimitry Andric /// functions.
12fe6060f1SDimitry Andric ///
13fe6060f1SDimitry Andric /// The results of this analysis are used to fill the register usage, flat
14fe6060f1SDimitry Andric /// usage, etc. into hardware registers.
15fe6060f1SDimitry Andric ///
16fe6060f1SDimitry Andric /// The analysis takes callees into account. E.g. if a function A that needs 10
17fe6060f1SDimitry Andric /// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
18fe6060f1SDimitry Andric /// will return 20.
19fe6060f1SDimitry Andric /// It is assumed that an indirect call can go into any function except
20fe6060f1SDimitry Andric /// hardware-entrypoints. Therefore the register usage of functions with
21fe6060f1SDimitry Andric /// indirect calls is estimated as the maximum of all non-entrypoint functions
22fe6060f1SDimitry Andric /// in the module.
23fe6060f1SDimitry Andric ///
24fe6060f1SDimitry Andric //===----------------------------------------------------------------------===//
25fe6060f1SDimitry Andric 
26fe6060f1SDimitry Andric #include "AMDGPUResourceUsageAnalysis.h"
27fe6060f1SDimitry Andric #include "AMDGPU.h"
28fe6060f1SDimitry Andric #include "GCNSubtarget.h"
29fe6060f1SDimitry Andric #include "SIMachineFunctionInfo.h"
30*81ad6265SDimitry Andric #include "llvm/ADT/PostOrderIterator.h"
31fe6060f1SDimitry Andric #include "llvm/Analysis/CallGraph.h"
32*81ad6265SDimitry Andric #include "llvm/CodeGen/MachineFrameInfo.h"
33fe6060f1SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
34349cc55cSDimitry Andric #include "llvm/IR/GlobalAlias.h"
35349cc55cSDimitry Andric #include "llvm/IR/GlobalValue.h"
36fe6060f1SDimitry Andric #include "llvm/Target/TargetMachine.h"
37fe6060f1SDimitry Andric 
38fe6060f1SDimitry Andric using namespace llvm;
39fe6060f1SDimitry Andric using namespace llvm::AMDGPU;
40fe6060f1SDimitry Andric 
41fe6060f1SDimitry Andric #define DEBUG_TYPE "amdgpu-resource-usage"
42fe6060f1SDimitry Andric 
// Static pass identifier and the externally visible reference to it.
43fe6060f1SDimitry Andric char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
44fe6060f1SDimitry Andric char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
45fe6060f1SDimitry Andric 
46fe6060f1SDimitry Andric // We need to tell the runtime some amount ahead of time if we don't know the
47fe6060f1SDimitry Andric // true stack size. Assume a smaller number if this is only due to dynamic /
48fe6060f1SDimitry Andric // non-entry block allocas.
// Stack size (in bytes) that analyzeResourceUsage assumes for a callee whose
// real frame size is not known: indirect calls, calls to declarations, callees
// without a computed entry, and non-tail calls to callees that may recurse.
49fe6060f1SDimitry Andric static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
50fe6060f1SDimitry Andric     "amdgpu-assume-external-call-stack-size",
51fe6060f1SDimitry Andric     cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
52fe6060f1SDimitry Andric     cl::init(16384));
53fe6060f1SDimitry Andric 
// Extra bytes analyzeResourceUsage adds to a function's private segment size
// when its frame contains variable sized objects.
54fe6060f1SDimitry Andric static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
55fe6060f1SDimitry Andric     "amdgpu-assume-dynamic-stack-object-size",
56fe6060f1SDimitry Andric     cl::desc("Assumed extra stack use if there are any "
57fe6060f1SDimitry Andric              "variable sized objects (in bytes)"),
58fe6060f1SDimitry Andric     cl::Hidden, cl::init(4096));
59fe6060f1SDimitry Andric 
// Register the pass under the DEBUG_TYPE name ("amdgpu-resource-usage").
60fe6060f1SDimitry Andric INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
61fe6060f1SDimitry Andric                 "Function register usage analysis", true, true)
62fe6060f1SDimitry Andric 
63fe6060f1SDimitry Andric static const Function *getCalleeFunction(const MachineOperand &Op) {
64fe6060f1SDimitry Andric   if (Op.isImm()) {
65fe6060f1SDimitry Andric     assert(Op.getImm() == 0);
66fe6060f1SDimitry Andric     return nullptr;
67fe6060f1SDimitry Andric   }
68349cc55cSDimitry Andric   if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
69349cc55cSDimitry Andric     return cast<Function>(GA->getOperand(0));
70fe6060f1SDimitry Andric   return cast<Function>(Op.getGlobal());
71fe6060f1SDimitry Andric }
72fe6060f1SDimitry Andric 
73fe6060f1SDimitry Andric static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
74fe6060f1SDimitry Andric                                   const SIInstrInfo &TII, unsigned Reg) {
75fe6060f1SDimitry Andric   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
76fe6060f1SDimitry Andric     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
77fe6060f1SDimitry Andric       return true;
78fe6060f1SDimitry Andric   }
79fe6060f1SDimitry Andric 
80fe6060f1SDimitry Andric   return false;
81fe6060f1SDimitry Andric }
82fe6060f1SDimitry Andric 
83fe6060f1SDimitry Andric int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
84fe6060f1SDimitry Andric     const GCNSubtarget &ST) const {
85fe6060f1SDimitry Andric   return NumExplicitSGPR +
86fe6060f1SDimitry Andric          IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
87fe6060f1SDimitry Andric                                    ST.getTargetID().isXnackOnOrAny());
88fe6060f1SDimitry Andric }
89fe6060f1SDimitry Andric 
90fe6060f1SDimitry Andric int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
91349cc55cSDimitry Andric     const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
92*81ad6265SDimitry Andric   return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
93349cc55cSDimitry Andric }
94349cc55cSDimitry Andric 
95349cc55cSDimitry Andric int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
96fe6060f1SDimitry Andric     const GCNSubtarget &ST) const {
97349cc55cSDimitry Andric   return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
98fe6060f1SDimitry Andric }
99fe6060f1SDimitry Andric 
100*81ad6265SDimitry Andric bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
101fe6060f1SDimitry Andric   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
102fe6060f1SDimitry Andric   if (!TPC)
103fe6060f1SDimitry Andric     return false;
104fe6060f1SDimitry Andric 
105*81ad6265SDimitry Andric   MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
106fe6060f1SDimitry Andric   const TargetMachine &TM = TPC->getTM<TargetMachine>();
107fe6060f1SDimitry Andric   bool HasIndirectCall = false;
108fe6060f1SDimitry Andric 
109*81ad6265SDimitry Andric   CallGraph CG = CallGraph(M);
110*81ad6265SDimitry Andric   auto End = po_end(&CG);
111*81ad6265SDimitry Andric 
112*81ad6265SDimitry Andric   for (auto IT = po_begin(&CG); IT != End; ++IT) {
113*81ad6265SDimitry Andric     Function *F = IT->getFunction();
114fe6060f1SDimitry Andric     if (!F || F->isDeclaration())
115fe6060f1SDimitry Andric       continue;
116fe6060f1SDimitry Andric 
117*81ad6265SDimitry Andric     MachineFunction *MF = MMI.getMachineFunction(*F);
118*81ad6265SDimitry Andric     assert(MF && "function must have been generated already");
119fe6060f1SDimitry Andric 
120fe6060f1SDimitry Andric     auto CI = CallGraphResourceInfo.insert(
121*81ad6265SDimitry Andric         std::make_pair(F, SIFunctionResourceInfo()));
122fe6060f1SDimitry Andric     SIFunctionResourceInfo &Info = CI.first->second;
123fe6060f1SDimitry Andric     assert(CI.second && "should only be called once per function");
124*81ad6265SDimitry Andric     Info = analyzeResourceUsage(*MF, TM);
125fe6060f1SDimitry Andric     HasIndirectCall |= Info.HasIndirectCall;
126fe6060f1SDimitry Andric   }
127fe6060f1SDimitry Andric 
128fe6060f1SDimitry Andric   if (HasIndirectCall)
129fe6060f1SDimitry Andric     propagateIndirectCallRegisterUsage();
130fe6060f1SDimitry Andric 
131fe6060f1SDimitry Andric   return false;
132fe6060f1SDimitry Andric }
133fe6060f1SDimitry Andric 
134fe6060f1SDimitry Andric AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
135fe6060f1SDimitry Andric AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
136fe6060f1SDimitry Andric     const MachineFunction &MF, const TargetMachine &TM) const {
137fe6060f1SDimitry Andric   SIFunctionResourceInfo Info;
138fe6060f1SDimitry Andric 
139fe6060f1SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
140fe6060f1SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
141fe6060f1SDimitry Andric   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
142fe6060f1SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
143fe6060f1SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
144fe6060f1SDimitry Andric   const SIRegisterInfo &TRI = TII->getRegisterInfo();
145fe6060f1SDimitry Andric 
146fe6060f1SDimitry Andric   Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
147fe6060f1SDimitry Andric                          MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
148fe6060f1SDimitry Andric                          MRI.isLiveIn(MFI->getPreloadedReg(
149fe6060f1SDimitry Andric                              AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
150fe6060f1SDimitry Andric 
151fe6060f1SDimitry Andric   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
152fe6060f1SDimitry Andric   // instructions aren't used to access the scratch buffer. Inline assembly may
153fe6060f1SDimitry Andric   // need it though.
154fe6060f1SDimitry Andric   //
155fe6060f1SDimitry Andric   // If we only have implicit uses of flat_scr on flat instructions, it is not
156fe6060f1SDimitry Andric   // really needed.
157fe6060f1SDimitry Andric   if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
158fe6060f1SDimitry Andric       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
159fe6060f1SDimitry Andric        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
160fe6060f1SDimitry Andric        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
161fe6060f1SDimitry Andric     Info.UsesFlatScratch = false;
162fe6060f1SDimitry Andric   }
163fe6060f1SDimitry Andric 
164fe6060f1SDimitry Andric   Info.PrivateSegmentSize = FrameInfo.getStackSize();
165fe6060f1SDimitry Andric 
166fe6060f1SDimitry Andric   // Assume a big number if there are any unknown sized objects.
167fe6060f1SDimitry Andric   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
168fe6060f1SDimitry Andric   if (Info.HasDynamicallySizedStack)
169fe6060f1SDimitry Andric     Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
170fe6060f1SDimitry Andric 
171fe6060f1SDimitry Andric   if (MFI->isStackRealigned())
172fe6060f1SDimitry Andric     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
173fe6060f1SDimitry Andric 
174fe6060f1SDimitry Andric   Info.UsesVCC =
175fe6060f1SDimitry Andric       MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
176fe6060f1SDimitry Andric 
177fe6060f1SDimitry Andric   // If there are no calls, MachineRegisterInfo can tell us the used register
178fe6060f1SDimitry Andric   // count easily.
179fe6060f1SDimitry Andric   // A tail call isn't considered a call for MachineFrameInfo's purposes.
180fe6060f1SDimitry Andric   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
181fe6060f1SDimitry Andric     MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
182fe6060f1SDimitry Andric     for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
183fe6060f1SDimitry Andric       if (MRI.isPhysRegUsed(Reg)) {
184fe6060f1SDimitry Andric         HighestVGPRReg = Reg;
185fe6060f1SDimitry Andric         break;
186fe6060f1SDimitry Andric       }
187fe6060f1SDimitry Andric     }
188fe6060f1SDimitry Andric 
189fe6060f1SDimitry Andric     if (ST.hasMAIInsts()) {
190fe6060f1SDimitry Andric       MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
191fe6060f1SDimitry Andric       for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
192fe6060f1SDimitry Andric         if (MRI.isPhysRegUsed(Reg)) {
193fe6060f1SDimitry Andric           HighestAGPRReg = Reg;
194fe6060f1SDimitry Andric           break;
195fe6060f1SDimitry Andric         }
196fe6060f1SDimitry Andric       }
197fe6060f1SDimitry Andric       Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
198fe6060f1SDimitry Andric                          ? 0
199fe6060f1SDimitry Andric                          : TRI.getHWRegIndex(HighestAGPRReg) + 1;
200fe6060f1SDimitry Andric     }
201fe6060f1SDimitry Andric 
202fe6060f1SDimitry Andric     MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
203fe6060f1SDimitry Andric     for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
204fe6060f1SDimitry Andric       if (MRI.isPhysRegUsed(Reg)) {
205fe6060f1SDimitry Andric         HighestSGPRReg = Reg;
206fe6060f1SDimitry Andric         break;
207fe6060f1SDimitry Andric       }
208fe6060f1SDimitry Andric     }
209fe6060f1SDimitry Andric 
210fe6060f1SDimitry Andric     // We found the maximum register index. They start at 0, so add one to get
211fe6060f1SDimitry Andric     // the number of registers.
212fe6060f1SDimitry Andric     Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
213fe6060f1SDimitry Andric                        ? 0
214fe6060f1SDimitry Andric                        : TRI.getHWRegIndex(HighestVGPRReg) + 1;
215fe6060f1SDimitry Andric     Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
216fe6060f1SDimitry Andric                                ? 0
217fe6060f1SDimitry Andric                                : TRI.getHWRegIndex(HighestSGPRReg) + 1;
218fe6060f1SDimitry Andric 
219fe6060f1SDimitry Andric     return Info;
220fe6060f1SDimitry Andric   }
221fe6060f1SDimitry Andric 
222fe6060f1SDimitry Andric   int32_t MaxVGPR = -1;
223fe6060f1SDimitry Andric   int32_t MaxAGPR = -1;
224fe6060f1SDimitry Andric   int32_t MaxSGPR = -1;
225fe6060f1SDimitry Andric   uint64_t CalleeFrameSize = 0;
226fe6060f1SDimitry Andric 
227fe6060f1SDimitry Andric   for (const MachineBasicBlock &MBB : MF) {
228fe6060f1SDimitry Andric     for (const MachineInstr &MI : MBB) {
229fe6060f1SDimitry Andric       // TODO: Check regmasks? Do they occur anywhere except calls?
230fe6060f1SDimitry Andric       for (const MachineOperand &MO : MI.operands()) {
231fe6060f1SDimitry Andric         unsigned Width = 0;
232fe6060f1SDimitry Andric         bool IsSGPR = false;
233fe6060f1SDimitry Andric         bool IsAGPR = false;
234fe6060f1SDimitry Andric 
235fe6060f1SDimitry Andric         if (!MO.isReg())
236fe6060f1SDimitry Andric           continue;
237fe6060f1SDimitry Andric 
238fe6060f1SDimitry Andric         Register Reg = MO.getReg();
239fe6060f1SDimitry Andric         switch (Reg) {
240fe6060f1SDimitry Andric         case AMDGPU::EXEC:
241fe6060f1SDimitry Andric         case AMDGPU::EXEC_LO:
242fe6060f1SDimitry Andric         case AMDGPU::EXEC_HI:
243fe6060f1SDimitry Andric         case AMDGPU::SCC:
244fe6060f1SDimitry Andric         case AMDGPU::M0:
245fe6060f1SDimitry Andric         case AMDGPU::M0_LO16:
246fe6060f1SDimitry Andric         case AMDGPU::M0_HI16:
247fe6060f1SDimitry Andric         case AMDGPU::SRC_SHARED_BASE:
248fe6060f1SDimitry Andric         case AMDGPU::SRC_SHARED_LIMIT:
249fe6060f1SDimitry Andric         case AMDGPU::SRC_PRIVATE_BASE:
250fe6060f1SDimitry Andric         case AMDGPU::SRC_PRIVATE_LIMIT:
251fe6060f1SDimitry Andric         case AMDGPU::SGPR_NULL:
252*81ad6265SDimitry Andric         case AMDGPU::SGPR_NULL64:
253fe6060f1SDimitry Andric         case AMDGPU::MODE:
254fe6060f1SDimitry Andric           continue;
255fe6060f1SDimitry Andric 
256fe6060f1SDimitry Andric         case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
257fe6060f1SDimitry Andric           llvm_unreachable("src_pops_exiting_wave_id should not be used");
258fe6060f1SDimitry Andric 
259fe6060f1SDimitry Andric         case AMDGPU::NoRegister:
260fe6060f1SDimitry Andric           assert(MI.isDebugInstr() &&
261fe6060f1SDimitry Andric                  "Instruction uses invalid noreg register");
262fe6060f1SDimitry Andric           continue;
263fe6060f1SDimitry Andric 
264fe6060f1SDimitry Andric         case AMDGPU::VCC:
265fe6060f1SDimitry Andric         case AMDGPU::VCC_LO:
266fe6060f1SDimitry Andric         case AMDGPU::VCC_HI:
267fe6060f1SDimitry Andric         case AMDGPU::VCC_LO_LO16:
268fe6060f1SDimitry Andric         case AMDGPU::VCC_LO_HI16:
269fe6060f1SDimitry Andric         case AMDGPU::VCC_HI_LO16:
270fe6060f1SDimitry Andric         case AMDGPU::VCC_HI_HI16:
271fe6060f1SDimitry Andric           Info.UsesVCC = true;
272fe6060f1SDimitry Andric           continue;
273fe6060f1SDimitry Andric 
274fe6060f1SDimitry Andric         case AMDGPU::FLAT_SCR:
275fe6060f1SDimitry Andric         case AMDGPU::FLAT_SCR_LO:
276fe6060f1SDimitry Andric         case AMDGPU::FLAT_SCR_HI:
277fe6060f1SDimitry Andric           continue;
278fe6060f1SDimitry Andric 
279fe6060f1SDimitry Andric         case AMDGPU::XNACK_MASK:
280fe6060f1SDimitry Andric         case AMDGPU::XNACK_MASK_LO:
281fe6060f1SDimitry Andric         case AMDGPU::XNACK_MASK_HI:
282fe6060f1SDimitry Andric           llvm_unreachable("xnack_mask registers should not be used");
283fe6060f1SDimitry Andric 
284fe6060f1SDimitry Andric         case AMDGPU::LDS_DIRECT:
285fe6060f1SDimitry Andric           llvm_unreachable("lds_direct register should not be used");
286fe6060f1SDimitry Andric 
287fe6060f1SDimitry Andric         case AMDGPU::TBA:
288fe6060f1SDimitry Andric         case AMDGPU::TBA_LO:
289fe6060f1SDimitry Andric         case AMDGPU::TBA_HI:
290fe6060f1SDimitry Andric         case AMDGPU::TMA:
291fe6060f1SDimitry Andric         case AMDGPU::TMA_LO:
292fe6060f1SDimitry Andric         case AMDGPU::TMA_HI:
293fe6060f1SDimitry Andric           llvm_unreachable("trap handler registers should not be used");
294fe6060f1SDimitry Andric 
295fe6060f1SDimitry Andric         case AMDGPU::SRC_VCCZ:
296fe6060f1SDimitry Andric           llvm_unreachable("src_vccz register should not be used");
297fe6060f1SDimitry Andric 
298fe6060f1SDimitry Andric         case AMDGPU::SRC_EXECZ:
299fe6060f1SDimitry Andric           llvm_unreachable("src_execz register should not be used");
300fe6060f1SDimitry Andric 
301fe6060f1SDimitry Andric         case AMDGPU::SRC_SCC:
302fe6060f1SDimitry Andric           llvm_unreachable("src_scc register should not be used");
303fe6060f1SDimitry Andric 
304fe6060f1SDimitry Andric         default:
305fe6060f1SDimitry Andric           break;
306fe6060f1SDimitry Andric         }
307fe6060f1SDimitry Andric 
308fe6060f1SDimitry Andric         if (AMDGPU::SReg_32RegClass.contains(Reg) ||
309fe6060f1SDimitry Andric             AMDGPU::SReg_LO16RegClass.contains(Reg) ||
310fe6060f1SDimitry Andric             AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
311fe6060f1SDimitry Andric           assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
312fe6060f1SDimitry Andric                  "trap handler registers should not be used");
313fe6060f1SDimitry Andric           IsSGPR = true;
314fe6060f1SDimitry Andric           Width = 1;
315fe6060f1SDimitry Andric         } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
316fe6060f1SDimitry Andric                    AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
317fe6060f1SDimitry Andric                    AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
318fe6060f1SDimitry Andric           IsSGPR = false;
319fe6060f1SDimitry Andric           Width = 1;
320fe6060f1SDimitry Andric         } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
321fe6060f1SDimitry Andric                    AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
322fe6060f1SDimitry Andric           IsSGPR = false;
323fe6060f1SDimitry Andric           IsAGPR = true;
324fe6060f1SDimitry Andric           Width = 1;
325fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
326fe6060f1SDimitry Andric           assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
327fe6060f1SDimitry Andric                  "trap handler registers should not be used");
328fe6060f1SDimitry Andric           IsSGPR = true;
329fe6060f1SDimitry Andric           Width = 2;
330fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
331fe6060f1SDimitry Andric           IsSGPR = false;
332fe6060f1SDimitry Andric           Width = 2;
333fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
334fe6060f1SDimitry Andric           IsSGPR = false;
335fe6060f1SDimitry Andric           IsAGPR = true;
336fe6060f1SDimitry Andric           Width = 2;
337fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
338fe6060f1SDimitry Andric           IsSGPR = false;
339fe6060f1SDimitry Andric           Width = 3;
340fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
341fe6060f1SDimitry Andric           IsSGPR = true;
342fe6060f1SDimitry Andric           Width = 3;
343fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
344fe6060f1SDimitry Andric           IsSGPR = false;
345fe6060f1SDimitry Andric           IsAGPR = true;
346fe6060f1SDimitry Andric           Width = 3;
347fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
348fe6060f1SDimitry Andric           assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
349fe6060f1SDimitry Andric                  "trap handler registers should not be used");
350fe6060f1SDimitry Andric           IsSGPR = true;
351fe6060f1SDimitry Andric           Width = 4;
352fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
353fe6060f1SDimitry Andric           IsSGPR = false;
354fe6060f1SDimitry Andric           Width = 4;
355fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
356fe6060f1SDimitry Andric           IsSGPR = false;
357fe6060f1SDimitry Andric           IsAGPR = true;
358fe6060f1SDimitry Andric           Width = 4;
359fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
360fe6060f1SDimitry Andric           IsSGPR = false;
361fe6060f1SDimitry Andric           Width = 5;
362fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
363fe6060f1SDimitry Andric           IsSGPR = true;
364fe6060f1SDimitry Andric           Width = 5;
365fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
366fe6060f1SDimitry Andric           IsSGPR = false;
367fe6060f1SDimitry Andric           IsAGPR = true;
368fe6060f1SDimitry Andric           Width = 5;
369fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
370fe6060f1SDimitry Andric           IsSGPR = false;
371fe6060f1SDimitry Andric           Width = 6;
372fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
373fe6060f1SDimitry Andric           IsSGPR = true;
374fe6060f1SDimitry Andric           Width = 6;
375fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
376fe6060f1SDimitry Andric           IsSGPR = false;
377fe6060f1SDimitry Andric           IsAGPR = true;
378fe6060f1SDimitry Andric           Width = 6;
379fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
380fe6060f1SDimitry Andric           IsSGPR = false;
381fe6060f1SDimitry Andric           Width = 7;
382fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
383fe6060f1SDimitry Andric           IsSGPR = true;
384fe6060f1SDimitry Andric           Width = 7;
385fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
386fe6060f1SDimitry Andric           IsSGPR = false;
387fe6060f1SDimitry Andric           IsAGPR = true;
388fe6060f1SDimitry Andric           Width = 7;
389fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
390fe6060f1SDimitry Andric           assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
391fe6060f1SDimitry Andric                  "trap handler registers should not be used");
392fe6060f1SDimitry Andric           IsSGPR = true;
393fe6060f1SDimitry Andric           Width = 8;
394fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
395fe6060f1SDimitry Andric           IsSGPR = false;
396fe6060f1SDimitry Andric           Width = 8;
397fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
398fe6060f1SDimitry Andric           IsSGPR = false;
399fe6060f1SDimitry Andric           IsAGPR = true;
400fe6060f1SDimitry Andric           Width = 8;
401fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
402fe6060f1SDimitry Andric           assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
403fe6060f1SDimitry Andric                  "trap handler registers should not be used");
404fe6060f1SDimitry Andric           IsSGPR = true;
405fe6060f1SDimitry Andric           Width = 16;
406fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
407fe6060f1SDimitry Andric           IsSGPR = false;
408fe6060f1SDimitry Andric           Width = 16;
409fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
410fe6060f1SDimitry Andric           IsSGPR = false;
411fe6060f1SDimitry Andric           IsAGPR = true;
412fe6060f1SDimitry Andric           Width = 16;
413fe6060f1SDimitry Andric         } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
414fe6060f1SDimitry Andric           IsSGPR = true;
415fe6060f1SDimitry Andric           Width = 32;
416fe6060f1SDimitry Andric         } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
417fe6060f1SDimitry Andric           IsSGPR = false;
418fe6060f1SDimitry Andric           Width = 32;
419fe6060f1SDimitry Andric         } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
420fe6060f1SDimitry Andric           IsSGPR = false;
421fe6060f1SDimitry Andric           IsAGPR = true;
422fe6060f1SDimitry Andric           Width = 32;
423fe6060f1SDimitry Andric         } else {
424fe6060f1SDimitry Andric           llvm_unreachable("Unknown register class");
425fe6060f1SDimitry Andric         }
426fe6060f1SDimitry Andric         unsigned HWReg = TRI.getHWRegIndex(Reg);
427fe6060f1SDimitry Andric         int MaxUsed = HWReg + Width - 1;
428fe6060f1SDimitry Andric         if (IsSGPR) {
429fe6060f1SDimitry Andric           MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
430fe6060f1SDimitry Andric         } else if (IsAGPR) {
431fe6060f1SDimitry Andric           MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
432fe6060f1SDimitry Andric         } else {
433fe6060f1SDimitry Andric           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
434fe6060f1SDimitry Andric         }
435fe6060f1SDimitry Andric       }
436fe6060f1SDimitry Andric 
437fe6060f1SDimitry Andric       if (MI.isCall()) {
438fe6060f1SDimitry Andric         // Pseudo used just to encode the underlying global. Is there a better
439fe6060f1SDimitry Andric         // way to track this?
440fe6060f1SDimitry Andric 
441fe6060f1SDimitry Andric         const MachineOperand *CalleeOp =
442fe6060f1SDimitry Andric             TII->getNamedOperand(MI, AMDGPU::OpName::callee);
443fe6060f1SDimitry Andric 
444fe6060f1SDimitry Andric         const Function *Callee = getCalleeFunction(*CalleeOp);
445fe6060f1SDimitry Andric         DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
446fe6060f1SDimitry Andric             CallGraphResourceInfo.end();
447fe6060f1SDimitry Andric 
448fe6060f1SDimitry Andric         // Avoid crashing on undefined behavior with an illegal call to a
449fe6060f1SDimitry Andric         // kernel. If a callsite's calling convention doesn't match the
450fe6060f1SDimitry Andric         // function's, it's undefined behavior. If the callsite calling
451fe6060f1SDimitry Andric         // convention does match, that would have errored earlier.
452fe6060f1SDimitry Andric         if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
453fe6060f1SDimitry Andric           report_fatal_error("invalid call to entry function");
454fe6060f1SDimitry Andric 
455fe6060f1SDimitry Andric         bool IsIndirect = !Callee || Callee->isDeclaration();
456fe6060f1SDimitry Andric         if (!IsIndirect)
457fe6060f1SDimitry Andric           I = CallGraphResourceInfo.find(Callee);
458fe6060f1SDimitry Andric 
459349cc55cSDimitry Andric         // FIXME: Call site could have norecurse on it
460349cc55cSDimitry Andric         if (!Callee || !Callee->doesNotRecurse()) {
461349cc55cSDimitry Andric           Info.HasRecursion = true;
462349cc55cSDimitry Andric 
463349cc55cSDimitry Andric           // TODO: If we happen to know there is no stack usage in the
464349cc55cSDimitry Andric           // callgraph, we don't need to assume an infinitely growing stack.
465349cc55cSDimitry Andric           if (!MI.isReturn()) {
466349cc55cSDimitry Andric             // We don't need to assume an unknown stack size for tail calls.
467349cc55cSDimitry Andric 
468349cc55cSDimitry Andric             // FIXME: This only benefits in the case where the kernel does not
469349cc55cSDimitry Andric             // directly call the tail called function. If a kernel directly
470349cc55cSDimitry Andric             // calls a tail recursive function, we'll assume maximum stack size
471349cc55cSDimitry Andric             // based on the regular call instruction.
472349cc55cSDimitry Andric             CalleeFrameSize =
473349cc55cSDimitry Andric               std::max(CalleeFrameSize,
474349cc55cSDimitry Andric                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
475349cc55cSDimitry Andric           }
476349cc55cSDimitry Andric         }
477349cc55cSDimitry Andric 
478fe6060f1SDimitry Andric         if (IsIndirect || I == CallGraphResourceInfo.end()) {
479fe6060f1SDimitry Andric           CalleeFrameSize =
480fe6060f1SDimitry Andric               std::max(CalleeFrameSize,
481fe6060f1SDimitry Andric                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
482fe6060f1SDimitry Andric 
483fe6060f1SDimitry Andric           // Register usage of indirect calls gets handled later
484fe6060f1SDimitry Andric           Info.UsesVCC = true;
485fe6060f1SDimitry Andric           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
486fe6060f1SDimitry Andric           Info.HasDynamicallySizedStack = true;
487fe6060f1SDimitry Andric           Info.HasIndirectCall = true;
488fe6060f1SDimitry Andric         } else {
489fe6060f1SDimitry Andric           // We force CodeGen to run in SCC order, so the callee's register
490fe6060f1SDimitry Andric           // usage etc. should be the cumulative usage of all callees.
491fe6060f1SDimitry Andric           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
492fe6060f1SDimitry Andric           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
493fe6060f1SDimitry Andric           MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
494fe6060f1SDimitry Andric           CalleeFrameSize =
495fe6060f1SDimitry Andric               std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
496fe6060f1SDimitry Andric           Info.UsesVCC |= I->second.UsesVCC;
497fe6060f1SDimitry Andric           Info.UsesFlatScratch |= I->second.UsesFlatScratch;
498fe6060f1SDimitry Andric           Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
499fe6060f1SDimitry Andric           Info.HasRecursion |= I->second.HasRecursion;
500fe6060f1SDimitry Andric           Info.HasIndirectCall |= I->second.HasIndirectCall;
501fe6060f1SDimitry Andric         }
502fe6060f1SDimitry Andric       }
503fe6060f1SDimitry Andric     }
504fe6060f1SDimitry Andric   }
505fe6060f1SDimitry Andric 
506fe6060f1SDimitry Andric   Info.NumExplicitSGPR = MaxSGPR + 1;
507fe6060f1SDimitry Andric   Info.NumVGPR = MaxVGPR + 1;
508fe6060f1SDimitry Andric   Info.NumAGPR = MaxAGPR + 1;
509fe6060f1SDimitry Andric   Info.PrivateSegmentSize += CalleeFrameSize;
510fe6060f1SDimitry Andric 
511fe6060f1SDimitry Andric   return Info;
512fe6060f1SDimitry Andric }
513fe6060f1SDimitry Andric 
514fe6060f1SDimitry Andric void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
515fe6060f1SDimitry Andric   // Collect the maximum number of registers from non-hardware-entrypoints.
516fe6060f1SDimitry Andric   // All these functions are potential targets for indirect calls.
517fe6060f1SDimitry Andric   int32_t NonKernelMaxSGPRs = 0;
518fe6060f1SDimitry Andric   int32_t NonKernelMaxVGPRs = 0;
519fe6060f1SDimitry Andric   int32_t NonKernelMaxAGPRs = 0;
520fe6060f1SDimitry Andric 
521fe6060f1SDimitry Andric   for (const auto &I : CallGraphResourceInfo) {
522fe6060f1SDimitry Andric     if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
523fe6060f1SDimitry Andric       auto &Info = I.getSecond();
524fe6060f1SDimitry Andric       NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
525fe6060f1SDimitry Andric       NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
526fe6060f1SDimitry Andric       NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
527fe6060f1SDimitry Andric     }
528fe6060f1SDimitry Andric   }
529fe6060f1SDimitry Andric 
530fe6060f1SDimitry Andric   // Add register usage for functions with indirect calls.
531fe6060f1SDimitry Andric   // For calls to unknown functions, we assume the maximum register usage of
532fe6060f1SDimitry Andric   // all non-hardware-entrypoints in the current module.
533fe6060f1SDimitry Andric   for (auto &I : CallGraphResourceInfo) {
534fe6060f1SDimitry Andric     auto &Info = I.getSecond();
535fe6060f1SDimitry Andric     if (Info.HasIndirectCall) {
536fe6060f1SDimitry Andric       Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
537fe6060f1SDimitry Andric       Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
538fe6060f1SDimitry Andric       Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
539fe6060f1SDimitry Andric     }
540fe6060f1SDimitry Andric   }
541fe6060f1SDimitry Andric }
542