xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp (revision 3ceba58a7509418b47b8fca2d2b6bbf088714e26)
//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Analyzes how many registers and other resources are used by
11 /// functions.
12 ///
13 /// The results of this analysis are used to fill the register usage, flat
14 /// usage, etc. into hardware registers.
15 ///
16 /// The analysis takes callees into account. E.g. if a function A that needs 10
17 /// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
18 /// will return 20.
19 /// It is assumed that an indirect call can go into any function except
20 /// hardware-entrypoints. Therefore the register usage of functions with
21 /// indirect calls is estimated as the maximum of all non-entrypoint functions
22 /// in the module.
23 ///
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPUResourceUsageAnalysis.h"
27 #include "AMDGPU.h"
28 #include "GCNSubtarget.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "llvm/ADT/PostOrderIterator.h"
31 #include "llvm/Analysis/CallGraph.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/TargetPassConfig.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/Target/TargetMachine.h"
37 
38 using namespace llvm;
39 using namespace llvm::AMDGPU;
40 
41 #define DEBUG_TYPE "amdgpu-resource-usage"
42 
// Storage for the pass ID, followed by the namespace-level reference through
// which that same ID object is exposed as AMDGPUResourceUsageAnalysisID.
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
45 
// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
//
// The cl::init values below are only defaults: runOnModule() replaces an
// option that was not given explicitly on the command line with 0 when the
// module uses code object v5 or later, or when the target OS is AMDPAL.

// Stack bytes assumed for a call whose callee stack usage is not known
// (default 16384).
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

// Extra stack bytes assumed for a function whose frame contains variable
// sized objects (default 4096).
static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
59 
// Pass initialization boilerplate. The pass argument name is DEBUG_TYPE
// ("amdgpu-resource-usage") and the string below is its description.
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
62 
63 static const Function *getCalleeFunction(const MachineOperand &Op) {
64   if (Op.isImm()) {
65     assert(Op.getImm() == 0);
66     return nullptr;
67   }
68   return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
69 }
70 
71 static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
72                                   const SIInstrInfo &TII, unsigned Reg) {
73   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
74     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
75       return true;
76   }
77 
78   return false;
79 }
80 
81 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
82     const GCNSubtarget &ST) const {
83   return NumExplicitSGPR +
84          IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
85                                    ST.getTargetID().isXnackOnOrAny());
86 }
87 
88 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
89     const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
90   return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
91 }
92 
93 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
94     const GCNSubtarget &ST) const {
95   return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
96 }
97 
98 bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
99   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
100   if (!TPC)
101     return false;
102 
103   MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
104   const TargetMachine &TM = TPC->getTM<TargetMachine>();
105   const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
106   bool HasIndirectCall = false;
107 
108   CallGraph CG = CallGraph(M);
109   auto End = po_end(&CG);
110 
111   // By default, for code object v5 and later, track only the minimum scratch
112   // size
113   uint32_t AssumedStackSizeForDynamicSizeObjects =
114       clAssumedStackSizeForDynamicSizeObjects;
115   uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
116   if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
117       STI.getTargetTriple().getOS() == Triple::AMDPAL) {
118     if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
119       AssumedStackSizeForDynamicSizeObjects = 0;
120     if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
121       AssumedStackSizeForExternalCall = 0;
122   }
123 
124   for (auto IT = po_begin(&CG); IT != End; ++IT) {
125     Function *F = IT->getFunction();
126     if (!F || F->isDeclaration())
127       continue;
128 
129     MachineFunction *MF = MMI.getMachineFunction(*F);
130     assert(MF && "function must have been generated already");
131 
132     auto CI =
133         CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
134     SIFunctionResourceInfo &Info = CI.first->second;
135     assert(CI.second && "should only be called once per function");
136     Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
137                                 AssumedStackSizeForExternalCall);
138     HasIndirectCall |= Info.HasIndirectCall;
139   }
140 
141   // It's possible we have unreachable functions in the module which weren't
142   // visited by the PO traversal. Make sure we have some resource counts to
143   // report.
144   for (const auto &IT : CG) {
145     const Function *F = IT.first;
146     if (!F || F->isDeclaration())
147       continue;
148 
149     auto CI =
150         CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
151     if (!CI.second) // Skip already visited functions
152       continue;
153 
154     SIFunctionResourceInfo &Info = CI.first->second;
155     MachineFunction *MF = MMI.getMachineFunction(*F);
156     assert(MF && "function must have been generated already");
157     Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
158                                 AssumedStackSizeForExternalCall);
159     HasIndirectCall |= Info.HasIndirectCall;
160   }
161 
162   if (HasIndirectCall)
163     propagateIndirectCallRegisterUsage();
164 
165   return false;
166 }
167 
168 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
169 AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
170     const MachineFunction &MF, const TargetMachine &TM,
171     uint32_t AssumedStackSizeForDynamicSizeObjects,
172     uint32_t AssumedStackSizeForExternalCall) const {
173   SIFunctionResourceInfo Info;
174 
175   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
176   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
177   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
178   const MachineRegisterInfo &MRI = MF.getRegInfo();
179   const SIInstrInfo *TII = ST.getInstrInfo();
180   const SIRegisterInfo &TRI = TII->getRegisterInfo();
181 
182   Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
183                          MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
184                          MRI.isLiveIn(MFI->getPreloadedReg(
185                              AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
186 
187   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
188   // instructions aren't used to access the scratch buffer. Inline assembly may
189   // need it though.
190   //
191   // If we only have implicit uses of flat_scr on flat instructions, it is not
192   // really needed.
193   if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
194       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
195        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
196        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
197     Info.UsesFlatScratch = false;
198   }
199 
200   Info.PrivateSegmentSize = FrameInfo.getStackSize();
201 
202   // Assume a big number if there are any unknown sized objects.
203   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
204   if (Info.HasDynamicallySizedStack)
205     Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
206 
207   if (MFI->isStackRealigned())
208     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
209 
210   Info.UsesVCC =
211       MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
212 
213   // If there are no calls, MachineRegisterInfo can tell us the used register
214   // count easily.
215   // A tail call isn't considered a call for MachineFrameInfo's purposes.
216   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
217     MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
218     for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
219       if (MRI.isPhysRegUsed(Reg)) {
220         HighestVGPRReg = Reg;
221         break;
222       }
223     }
224 
225     if (ST.hasMAIInsts()) {
226       MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
227       for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
228         if (MRI.isPhysRegUsed(Reg)) {
229           HighestAGPRReg = Reg;
230           break;
231         }
232       }
233       Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
234                          ? 0
235                          : TRI.getHWRegIndex(HighestAGPRReg) + 1;
236     }
237 
238     MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
239     for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
240       if (MRI.isPhysRegUsed(Reg)) {
241         HighestSGPRReg = Reg;
242         break;
243       }
244     }
245 
246     // We found the maximum register index. They start at 0, so add one to get
247     // the number of registers.
248     Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
249                        ? 0
250                        : TRI.getHWRegIndex(HighestVGPRReg) + 1;
251     Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
252                                ? 0
253                                : TRI.getHWRegIndex(HighestSGPRReg) + 1;
254 
255     return Info;
256   }
257 
258   int32_t MaxVGPR = -1;
259   int32_t MaxAGPR = -1;
260   int32_t MaxSGPR = -1;
261   uint64_t CalleeFrameSize = 0;
262 
263   for (const MachineBasicBlock &MBB : MF) {
264     for (const MachineInstr &MI : MBB) {
265       // TODO: Check regmasks? Do they occur anywhere except calls?
266       for (const MachineOperand &MO : MI.operands()) {
267         unsigned Width = 0;
268         bool IsSGPR = false;
269         bool IsAGPR = false;
270 
271         if (!MO.isReg())
272           continue;
273 
274         Register Reg = MO.getReg();
275         switch (Reg) {
276         case AMDGPU::EXEC:
277         case AMDGPU::EXEC_LO:
278         case AMDGPU::EXEC_HI:
279         case AMDGPU::SCC:
280         case AMDGPU::M0:
281         case AMDGPU::M0_LO16:
282         case AMDGPU::M0_HI16:
283         case AMDGPU::SRC_SHARED_BASE_LO:
284         case AMDGPU::SRC_SHARED_BASE:
285         case AMDGPU::SRC_SHARED_LIMIT_LO:
286         case AMDGPU::SRC_SHARED_LIMIT:
287         case AMDGPU::SRC_PRIVATE_BASE_LO:
288         case AMDGPU::SRC_PRIVATE_BASE:
289         case AMDGPU::SRC_PRIVATE_LIMIT_LO:
290         case AMDGPU::SRC_PRIVATE_LIMIT:
291         case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
292         case AMDGPU::SGPR_NULL:
293         case AMDGPU::SGPR_NULL64:
294         case AMDGPU::MODE:
295           continue;
296 
297         case AMDGPU::NoRegister:
298           assert(MI.isDebugInstr() &&
299                  "Instruction uses invalid noreg register");
300           continue;
301 
302         case AMDGPU::VCC:
303         case AMDGPU::VCC_LO:
304         case AMDGPU::VCC_HI:
305         case AMDGPU::VCC_LO_LO16:
306         case AMDGPU::VCC_LO_HI16:
307         case AMDGPU::VCC_HI_LO16:
308         case AMDGPU::VCC_HI_HI16:
309           Info.UsesVCC = true;
310           continue;
311 
312         case AMDGPU::FLAT_SCR:
313         case AMDGPU::FLAT_SCR_LO:
314         case AMDGPU::FLAT_SCR_HI:
315           continue;
316 
317         case AMDGPU::XNACK_MASK:
318         case AMDGPU::XNACK_MASK_LO:
319         case AMDGPU::XNACK_MASK_HI:
320           llvm_unreachable("xnack_mask registers should not be used");
321 
322         case AMDGPU::LDS_DIRECT:
323           llvm_unreachable("lds_direct register should not be used");
324 
325         case AMDGPU::TBA:
326         case AMDGPU::TBA_LO:
327         case AMDGPU::TBA_HI:
328         case AMDGPU::TMA:
329         case AMDGPU::TMA_LO:
330         case AMDGPU::TMA_HI:
331           llvm_unreachable("trap handler registers should not be used");
332 
333         case AMDGPU::SRC_VCCZ:
334           llvm_unreachable("src_vccz register should not be used");
335 
336         case AMDGPU::SRC_EXECZ:
337           llvm_unreachable("src_execz register should not be used");
338 
339         case AMDGPU::SRC_SCC:
340           llvm_unreachable("src_scc register should not be used");
341 
342         default:
343           break;
344         }
345 
346         if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
347             AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
348             AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
349           IsSGPR = true;
350           Width = 1;
351         } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
352                    AMDGPU::VGPR_16RegClass.contains(Reg)) {
353           IsSGPR = false;
354           Width = 1;
355         } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
356                    AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
357           IsSGPR = false;
358           IsAGPR = true;
359           Width = 1;
360         } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
361           IsSGPR = true;
362           Width = 2;
363         } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
364           IsSGPR = false;
365           Width = 2;
366         } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
367           IsSGPR = false;
368           IsAGPR = true;
369           Width = 2;
370         } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
371           IsSGPR = false;
372           Width = 3;
373         } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
374           IsSGPR = true;
375           Width = 3;
376         } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
377           IsSGPR = false;
378           IsAGPR = true;
379           Width = 3;
380         } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
381           IsSGPR = true;
382           Width = 4;
383         } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
384           IsSGPR = false;
385           Width = 4;
386         } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
387           IsSGPR = false;
388           IsAGPR = true;
389           Width = 4;
390         } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
391           IsSGPR = false;
392           Width = 5;
393         } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
394           IsSGPR = true;
395           Width = 5;
396         } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
397           IsSGPR = false;
398           IsAGPR = true;
399           Width = 5;
400         } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
401           IsSGPR = false;
402           Width = 6;
403         } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
404           IsSGPR = true;
405           Width = 6;
406         } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
407           IsSGPR = false;
408           IsAGPR = true;
409           Width = 6;
410         } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
411           IsSGPR = false;
412           Width = 7;
413         } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
414           IsSGPR = true;
415           Width = 7;
416         } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
417           IsSGPR = false;
418           IsAGPR = true;
419           Width = 7;
420         } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
421           IsSGPR = true;
422           Width = 8;
423         } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
424           IsSGPR = false;
425           Width = 8;
426         } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
427           IsSGPR = false;
428           IsAGPR = true;
429           Width = 8;
430         } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
431           IsSGPR = false;
432           Width = 9;
433         } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
434           IsSGPR = true;
435           Width = 9;
436         } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
437           IsSGPR = false;
438           IsAGPR = true;
439           Width = 9;
440         } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
441           IsSGPR = false;
442           Width = 10;
443         } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
444           IsSGPR = true;
445           Width = 10;
446         } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
447           IsSGPR = false;
448           IsAGPR = true;
449           Width = 10;
450         } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
451           IsSGPR = false;
452           Width = 11;
453         } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
454           IsSGPR = true;
455           Width = 11;
456         } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
457           IsSGPR = false;
458           IsAGPR = true;
459           Width = 11;
460         } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
461           IsSGPR = false;
462           Width = 12;
463         } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
464           IsSGPR = true;
465           Width = 12;
466         } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
467           IsSGPR = false;
468           IsAGPR = true;
469           Width = 12;
470         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
471           IsSGPR = true;
472           Width = 16;
473         } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
474           IsSGPR = false;
475           Width = 16;
476         } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
477           IsSGPR = false;
478           IsAGPR = true;
479           Width = 16;
480         } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
481           IsSGPR = true;
482           Width = 32;
483         } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
484           IsSGPR = false;
485           Width = 32;
486         } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
487           IsSGPR = false;
488           IsAGPR = true;
489           Width = 32;
490         } else {
491           // We only expect TTMP registers or registers that do not belong to
492           // any RC.
493           assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
494                   AMDGPU::TTMP_64RegClass.contains(Reg) ||
495                   AMDGPU::TTMP_128RegClass.contains(Reg) ||
496                   AMDGPU::TTMP_256RegClass.contains(Reg) ||
497                   AMDGPU::TTMP_512RegClass.contains(Reg) ||
498                   !TRI.getPhysRegBaseClass(Reg)) &&
499                  "Unknown register class");
500         }
501         unsigned HWReg = TRI.getHWRegIndex(Reg);
502         int MaxUsed = HWReg + Width - 1;
503         if (IsSGPR) {
504           MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
505         } else if (IsAGPR) {
506           MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
507         } else {
508           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
509         }
510       }
511 
512       if (MI.isCall()) {
513         // Pseudo used just to encode the underlying global. Is there a better
514         // way to track this?
515 
516         const MachineOperand *CalleeOp =
517             TII->getNamedOperand(MI, AMDGPU::OpName::callee);
518 
519         const Function *Callee = getCalleeFunction(*CalleeOp);
520         DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
521             CallGraphResourceInfo.end();
522 
523         // Avoid crashing on undefined behavior with an illegal call to a
524         // kernel. If a callsite's calling convention doesn't match the
525         // function's, it's undefined behavior. If the callsite calling
526         // convention does match, that would have errored earlier.
527         if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
528           report_fatal_error("invalid call to entry function");
529 
530         bool IsIndirect = !Callee || Callee->isDeclaration();
531         if (!IsIndirect)
532           I = CallGraphResourceInfo.find(Callee);
533 
534         // FIXME: Call site could have norecurse on it
535         if (!Callee || !Callee->doesNotRecurse()) {
536           Info.HasRecursion = true;
537 
538           // TODO: If we happen to know there is no stack usage in the
539           // callgraph, we don't need to assume an infinitely growing stack.
540           if (!MI.isReturn()) {
541             // We don't need to assume an unknown stack size for tail calls.
542 
543             // FIXME: This only benefits in the case where the kernel does not
544             // directly call the tail called function. If a kernel directly
545             // calls a tail recursive function, we'll assume maximum stack size
546             // based on the regular call instruction.
547             CalleeFrameSize = std::max(
548                 CalleeFrameSize,
549                 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
550           }
551         }
552 
553         if (IsIndirect || I == CallGraphResourceInfo.end()) {
554           CalleeFrameSize =
555               std::max(CalleeFrameSize,
556                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
557 
558           // Register usage of indirect calls gets handled later
559           Info.UsesVCC = true;
560           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
561           Info.HasDynamicallySizedStack = true;
562           Info.HasIndirectCall = true;
563         } else {
564           // We force CodeGen to run in SCC order, so the callee's register
565           // usage etc. should be the cumulative usage of all callees.
566           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
567           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
568           MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
569           CalleeFrameSize =
570               std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
571           Info.UsesVCC |= I->second.UsesVCC;
572           Info.UsesFlatScratch |= I->second.UsesFlatScratch;
573           Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
574           Info.HasRecursion |= I->second.HasRecursion;
575           Info.HasIndirectCall |= I->second.HasIndirectCall;
576         }
577       }
578     }
579   }
580 
581   Info.NumExplicitSGPR = MaxSGPR + 1;
582   Info.NumVGPR = MaxVGPR + 1;
583   Info.NumAGPR = MaxAGPR + 1;
584   Info.PrivateSegmentSize += CalleeFrameSize;
585 
586   return Info;
587 }
588 
589 void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
590   // Collect the maximum number of registers from non-hardware-entrypoints.
591   // All these functions are potential targets for indirect calls.
592   int32_t NonKernelMaxSGPRs = 0;
593   int32_t NonKernelMaxVGPRs = 0;
594   int32_t NonKernelMaxAGPRs = 0;
595 
596   for (const auto &I : CallGraphResourceInfo) {
597     if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
598       auto &Info = I.getSecond();
599       NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
600       NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
601       NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
602     }
603   }
604 
605   // Add register usage for functions with indirect calls.
606   // For calls to unknown functions, we assume the maximum register usage of
607   // all non-hardware-entrypoints in the current module.
608   for (auto &I : CallGraphResourceInfo) {
609     auto &Info = I.getSecond();
610     if (Info.HasIndirectCall) {
611       Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
612       Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
613       Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
614     }
615   }
616 }
617