xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp (revision 8f7ed58a15556bf567ff876e1999e4fe4d684e1d)
//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Analyzes how many registers and other resources are used by
11 /// functions.
12 ///
13 /// The results of this analysis are used to fill the register usage, flat
14 /// usage, etc. into hardware registers.
15 ///
16 /// The analysis takes callees into account. E.g. if a function A that needs 10
17 /// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
18 /// will return 20.
19 /// It is assumed that an indirect call can go into any function except
20 /// hardware-entrypoints. Therefore the register usage of functions with
21 /// indirect calls is estimated as the maximum of all non-entrypoint functions
22 /// in the module.
23 ///
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPUResourceUsageAnalysis.h"
27 #include "AMDGPU.h"
28 #include "GCNSubtarget.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "llvm/ADT/PostOrderIterator.h"
31 #include "llvm/Analysis/CallGraph.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/TargetPassConfig.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/Target/TargetMachine.h"
37 
38 using namespace llvm;
39 using namespace llvm::AMDGPU;
40 
41 #define DEBUG_TYPE "amdgpu-resource-usage"
42 
// Static pass identifier for AMDGPUResourceUsageAnalysis, plus a reference
// (AMDGPUResourceUsageAnalysisID) bound to that same object so the pass can be
// named without referring to the class member directly.
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
45 
// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
//
// Both options below are reset to 0 by runOnModule() for code object v5 and
// later (and for the AMDPAL OS) unless they were given explicitly on the
// command line.

// Number of bytes added to the callee frame size estimate in
// analyzeResourceUsage() for calls whose callee is unknown, only declared, or
// possibly recursive (non-tail) calls.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

// Number of bytes added to a function's private segment size in
// analyzeResourceUsage() when its frame has variable sized objects.
static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
59 
// Registers the pass under the DEBUG_TYPE name ("amdgpu-resource-usage") with
// a human-readable description.
// NOTE(review): the two trailing `true` arguments are presumably the CFG-only
// and is-analysis flags of INITIALIZE_PASS -- confirm against the macro.
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
62 
63 static const Function *getCalleeFunction(const MachineOperand &Op) {
64   if (Op.isImm()) {
65     assert(Op.getImm() == 0);
66     return nullptr;
67   }
68   if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
69     return cast<Function>(GA->getOperand(0));
70   return cast<Function>(Op.getGlobal());
71 }
72 
73 static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
74                                   const SIInstrInfo &TII, unsigned Reg) {
75   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
76     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
77       return true;
78   }
79 
80   return false;
81 }
82 
83 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
84     const GCNSubtarget &ST) const {
85   return NumExplicitSGPR +
86          IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
87                                    ST.getTargetID().isXnackOnOrAny());
88 }
89 
90 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
91     const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
92   return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
93 }
94 
95 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
96     const GCNSubtarget &ST) const {
97   return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
98 }
99 
100 bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
101   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
102   if (!TPC)
103     return false;
104 
105   MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
106   const TargetMachine &TM = TPC->getTM<TargetMachine>();
107   const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
108   bool HasIndirectCall = false;
109 
110   CallGraph CG = CallGraph(M);
111   auto End = po_end(&CG);
112 
113   // By default, for code object v5 and later, track only the minimum scratch
114   // size
115   if (AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
116       STI.getTargetTriple().getOS() == Triple::AMDPAL) {
117     if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
118       AssumedStackSizeForDynamicSizeObjects = 0;
119     if (!AssumedStackSizeForExternalCall.getNumOccurrences())
120       AssumedStackSizeForExternalCall = 0;
121   }
122 
123   for (auto IT = po_begin(&CG); IT != End; ++IT) {
124     Function *F = IT->getFunction();
125     if (!F || F->isDeclaration())
126       continue;
127 
128     MachineFunction *MF = MMI.getMachineFunction(*F);
129     assert(MF && "function must have been generated already");
130 
131     auto CI =
132         CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
133     SIFunctionResourceInfo &Info = CI.first->second;
134     assert(CI.second && "should only be called once per function");
135     Info = analyzeResourceUsage(*MF, TM);
136     HasIndirectCall |= Info.HasIndirectCall;
137   }
138 
139   // It's possible we have unreachable functions in the module which weren't
140   // visited by the PO traversal. Make sure we have some resource counts to
141   // report.
142   for (const auto &IT : CG) {
143     const Function *F = IT.first;
144     if (!F || F->isDeclaration())
145       continue;
146 
147     auto CI =
148         CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
149     if (!CI.second) // Skip already visited functions
150       continue;
151 
152     SIFunctionResourceInfo &Info = CI.first->second;
153     MachineFunction *MF = MMI.getMachineFunction(*F);
154     assert(MF && "function must have been generated already");
155     Info = analyzeResourceUsage(*MF, TM);
156     HasIndirectCall |= Info.HasIndirectCall;
157   }
158 
159   if (HasIndirectCall)
160     propagateIndirectCallRegisterUsage();
161 
162   return false;
163 }
164 
165 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
166 AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
167     const MachineFunction &MF, const TargetMachine &TM) const {
168   SIFunctionResourceInfo Info;
169 
170   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
171   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
172   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
173   const MachineRegisterInfo &MRI = MF.getRegInfo();
174   const SIInstrInfo *TII = ST.getInstrInfo();
175   const SIRegisterInfo &TRI = TII->getRegisterInfo();
176 
177   Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
178                          MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
179                          MRI.isLiveIn(MFI->getPreloadedReg(
180                              AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
181 
182   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
183   // instructions aren't used to access the scratch buffer. Inline assembly may
184   // need it though.
185   //
186   // If we only have implicit uses of flat_scr on flat instructions, it is not
187   // really needed.
188   if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
189       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
190        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
191        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
192     Info.UsesFlatScratch = false;
193   }
194 
195   Info.PrivateSegmentSize = FrameInfo.getStackSize();
196 
197   // Assume a big number if there are any unknown sized objects.
198   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
199   if (Info.HasDynamicallySizedStack)
200     Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
201 
202   if (MFI->isStackRealigned())
203     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
204 
205   Info.UsesVCC =
206       MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
207 
208   // If there are no calls, MachineRegisterInfo can tell us the used register
209   // count easily.
210   // A tail call isn't considered a call for MachineFrameInfo's purposes.
211   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
212     MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
213     for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
214       if (MRI.isPhysRegUsed(Reg)) {
215         HighestVGPRReg = Reg;
216         break;
217       }
218     }
219 
220     if (ST.hasMAIInsts()) {
221       MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
222       for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
223         if (MRI.isPhysRegUsed(Reg)) {
224           HighestAGPRReg = Reg;
225           break;
226         }
227       }
228       Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
229                          ? 0
230                          : TRI.getHWRegIndex(HighestAGPRReg) + 1;
231     }
232 
233     MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
234     for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
235       if (MRI.isPhysRegUsed(Reg)) {
236         HighestSGPRReg = Reg;
237         break;
238       }
239     }
240 
241     // We found the maximum register index. They start at 0, so add one to get
242     // the number of registers.
243     Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
244                        ? 0
245                        : TRI.getHWRegIndex(HighestVGPRReg) + 1;
246     Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
247                                ? 0
248                                : TRI.getHWRegIndex(HighestSGPRReg) + 1;
249 
250     return Info;
251   }
252 
253   int32_t MaxVGPR = -1;
254   int32_t MaxAGPR = -1;
255   int32_t MaxSGPR = -1;
256   uint64_t CalleeFrameSize = 0;
257 
258   for (const MachineBasicBlock &MBB : MF) {
259     for (const MachineInstr &MI : MBB) {
260       // TODO: Check regmasks? Do they occur anywhere except calls?
261       for (const MachineOperand &MO : MI.operands()) {
262         unsigned Width = 0;
263         bool IsSGPR = false;
264         bool IsAGPR = false;
265 
266         if (!MO.isReg())
267           continue;
268 
269         Register Reg = MO.getReg();
270         switch (Reg) {
271         case AMDGPU::EXEC:
272         case AMDGPU::EXEC_LO:
273         case AMDGPU::EXEC_HI:
274         case AMDGPU::SCC:
275         case AMDGPU::M0:
276         case AMDGPU::M0_LO16:
277         case AMDGPU::M0_HI16:
278         case AMDGPU::SRC_SHARED_BASE_LO:
279         case AMDGPU::SRC_SHARED_BASE:
280         case AMDGPU::SRC_SHARED_LIMIT_LO:
281         case AMDGPU::SRC_SHARED_LIMIT:
282         case AMDGPU::SRC_PRIVATE_BASE_LO:
283         case AMDGPU::SRC_PRIVATE_BASE:
284         case AMDGPU::SRC_PRIVATE_LIMIT_LO:
285         case AMDGPU::SRC_PRIVATE_LIMIT:
286         case AMDGPU::SGPR_NULL:
287         case AMDGPU::SGPR_NULL64:
288         case AMDGPU::MODE:
289           continue;
290 
291         case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
292           llvm_unreachable("src_pops_exiting_wave_id should not be used");
293 
294         case AMDGPU::NoRegister:
295           assert(MI.isDebugInstr() &&
296                  "Instruction uses invalid noreg register");
297           continue;
298 
299         case AMDGPU::VCC:
300         case AMDGPU::VCC_LO:
301         case AMDGPU::VCC_HI:
302         case AMDGPU::VCC_LO_LO16:
303         case AMDGPU::VCC_LO_HI16:
304         case AMDGPU::VCC_HI_LO16:
305         case AMDGPU::VCC_HI_HI16:
306           Info.UsesVCC = true;
307           continue;
308 
309         case AMDGPU::FLAT_SCR:
310         case AMDGPU::FLAT_SCR_LO:
311         case AMDGPU::FLAT_SCR_HI:
312           continue;
313 
314         case AMDGPU::XNACK_MASK:
315         case AMDGPU::XNACK_MASK_LO:
316         case AMDGPU::XNACK_MASK_HI:
317           llvm_unreachable("xnack_mask registers should not be used");
318 
319         case AMDGPU::LDS_DIRECT:
320           llvm_unreachable("lds_direct register should not be used");
321 
322         case AMDGPU::TBA:
323         case AMDGPU::TBA_LO:
324         case AMDGPU::TBA_HI:
325         case AMDGPU::TMA:
326         case AMDGPU::TMA_LO:
327         case AMDGPU::TMA_HI:
328           llvm_unreachable("trap handler registers should not be used");
329 
330         case AMDGPU::SRC_VCCZ:
331           llvm_unreachable("src_vccz register should not be used");
332 
333         case AMDGPU::SRC_EXECZ:
334           llvm_unreachable("src_execz register should not be used");
335 
336         case AMDGPU::SRC_SCC:
337           llvm_unreachable("src_scc register should not be used");
338 
339         default:
340           break;
341         }
342 
343         if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
344             AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
345             AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
346           IsSGPR = true;
347           Width = 1;
348         } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
349                    AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
350                    AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
351           IsSGPR = false;
352           Width = 1;
353         } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
354                    AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
355           IsSGPR = false;
356           IsAGPR = true;
357           Width = 1;
358         } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
359           IsSGPR = true;
360           Width = 2;
361         } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
362           IsSGPR = false;
363           Width = 2;
364         } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
365           IsSGPR = false;
366           IsAGPR = true;
367           Width = 2;
368         } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
369           IsSGPR = false;
370           Width = 3;
371         } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
372           IsSGPR = true;
373           Width = 3;
374         } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
375           IsSGPR = false;
376           IsAGPR = true;
377           Width = 3;
378         } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
379           IsSGPR = true;
380           Width = 4;
381         } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
382           IsSGPR = false;
383           Width = 4;
384         } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
385           IsSGPR = false;
386           IsAGPR = true;
387           Width = 4;
388         } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
389           IsSGPR = false;
390           Width = 5;
391         } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
392           IsSGPR = true;
393           Width = 5;
394         } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
395           IsSGPR = false;
396           IsAGPR = true;
397           Width = 5;
398         } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
399           IsSGPR = false;
400           Width = 6;
401         } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
402           IsSGPR = true;
403           Width = 6;
404         } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
405           IsSGPR = false;
406           IsAGPR = true;
407           Width = 6;
408         } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
409           IsSGPR = false;
410           Width = 7;
411         } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
412           IsSGPR = true;
413           Width = 7;
414         } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
415           IsSGPR = false;
416           IsAGPR = true;
417           Width = 7;
418         } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
419           IsSGPR = true;
420           Width = 8;
421         } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
422           IsSGPR = false;
423           Width = 8;
424         } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
425           IsSGPR = false;
426           IsAGPR = true;
427           Width = 8;
428         } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
429           IsSGPR = false;
430           Width = 9;
431         } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
432           IsSGPR = true;
433           Width = 9;
434         } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
435           IsSGPR = false;
436           IsAGPR = true;
437           Width = 9;
438         } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
439           IsSGPR = false;
440           Width = 10;
441         } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
442           IsSGPR = true;
443           Width = 10;
444         } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
445           IsSGPR = false;
446           IsAGPR = true;
447           Width = 10;
448         } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
449           IsSGPR = false;
450           Width = 11;
451         } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
452           IsSGPR = true;
453           Width = 11;
454         } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
455           IsSGPR = false;
456           IsAGPR = true;
457           Width = 11;
458         } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
459           IsSGPR = false;
460           Width = 12;
461         } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
462           IsSGPR = true;
463           Width = 12;
464         } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
465           IsSGPR = false;
466           IsAGPR = true;
467           Width = 12;
468         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
469           IsSGPR = true;
470           Width = 16;
471         } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
472           IsSGPR = false;
473           Width = 16;
474         } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
475           IsSGPR = false;
476           IsAGPR = true;
477           Width = 16;
478         } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
479           IsSGPR = true;
480           Width = 32;
481         } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
482           IsSGPR = false;
483           Width = 32;
484         } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
485           IsSGPR = false;
486           IsAGPR = true;
487           Width = 32;
488         } else {
489           // We only expect TTMP registers or registers that do not belong to
490           // any RC.
491           assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
492                   AMDGPU::TTMP_64RegClass.contains(Reg) ||
493                   AMDGPU::TTMP_128RegClass.contains(Reg) ||
494                   AMDGPU::TTMP_256RegClass.contains(Reg) ||
495                   AMDGPU::TTMP_512RegClass.contains(Reg) ||
496                   !TRI.getPhysRegBaseClass(Reg)) &&
497                  "Unknown register class");
498         }
499         unsigned HWReg = TRI.getHWRegIndex(Reg);
500         int MaxUsed = HWReg + Width - 1;
501         if (IsSGPR) {
502           MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
503         } else if (IsAGPR) {
504           MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
505         } else {
506           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
507         }
508       }
509 
510       if (MI.isCall()) {
511         // Pseudo used just to encode the underlying global. Is there a better
512         // way to track this?
513 
514         const MachineOperand *CalleeOp =
515             TII->getNamedOperand(MI, AMDGPU::OpName::callee);
516 
517         const Function *Callee = getCalleeFunction(*CalleeOp);
518         DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
519             CallGraphResourceInfo.end();
520 
521         // Avoid crashing on undefined behavior with an illegal call to a
522         // kernel. If a callsite's calling convention doesn't match the
523         // function's, it's undefined behavior. If the callsite calling
524         // convention does match, that would have errored earlier.
525         if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
526           report_fatal_error("invalid call to entry function");
527 
528         bool IsIndirect = !Callee || Callee->isDeclaration();
529         if (!IsIndirect)
530           I = CallGraphResourceInfo.find(Callee);
531 
532         // FIXME: Call site could have norecurse on it
533         if (!Callee || !Callee->doesNotRecurse()) {
534           Info.HasRecursion = true;
535 
536           // TODO: If we happen to know there is no stack usage in the
537           // callgraph, we don't need to assume an infinitely growing stack.
538           if (!MI.isReturn()) {
539             // We don't need to assume an unknown stack size for tail calls.
540 
541             // FIXME: This only benefits in the case where the kernel does not
542             // directly call the tail called function. If a kernel directly
543             // calls a tail recursive function, we'll assume maximum stack size
544             // based on the regular call instruction.
545             CalleeFrameSize =
546               std::max(CalleeFrameSize,
547                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
548           }
549         }
550 
551         if (IsIndirect || I == CallGraphResourceInfo.end()) {
552           CalleeFrameSize =
553               std::max(CalleeFrameSize,
554                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
555 
556           // Register usage of indirect calls gets handled later
557           Info.UsesVCC = true;
558           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
559           Info.HasDynamicallySizedStack = true;
560           Info.HasIndirectCall = true;
561         } else {
562           // We force CodeGen to run in SCC order, so the callee's register
563           // usage etc. should be the cumulative usage of all callees.
564           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
565           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
566           MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
567           CalleeFrameSize =
568               std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
569           Info.UsesVCC |= I->second.UsesVCC;
570           Info.UsesFlatScratch |= I->second.UsesFlatScratch;
571           Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
572           Info.HasRecursion |= I->second.HasRecursion;
573           Info.HasIndirectCall |= I->second.HasIndirectCall;
574         }
575       }
576     }
577   }
578 
579   Info.NumExplicitSGPR = MaxSGPR + 1;
580   Info.NumVGPR = MaxVGPR + 1;
581   Info.NumAGPR = MaxAGPR + 1;
582   Info.PrivateSegmentSize += CalleeFrameSize;
583 
584   return Info;
585 }
586 
587 void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
588   // Collect the maximum number of registers from non-hardware-entrypoints.
589   // All these functions are potential targets for indirect calls.
590   int32_t NonKernelMaxSGPRs = 0;
591   int32_t NonKernelMaxVGPRs = 0;
592   int32_t NonKernelMaxAGPRs = 0;
593 
594   for (const auto &I : CallGraphResourceInfo) {
595     if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
596       auto &Info = I.getSecond();
597       NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
598       NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
599       NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
600     }
601   }
602 
603   // Add register usage for functions with indirect calls.
604   // For calls to unknown functions, we assume the maximum register usage of
605   // all non-hardware-entrypoints in the current module.
606   for (auto &I : CallGraphResourceInfo) {
607     auto &Info = I.getSecond();
608     if (Info.HasIndirectCall) {
609       Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
610       Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
611       Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
612     }
613   }
614 }
615