//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill in the register usage, flat
/// scratch usage, etc. reported for each function (for example in the program
/// resource registers emitted for kernels).
///
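/// The analysis is exposed both through the legacy pass manager
/// (AMDGPUResourceUsageAnalysisWrapperPass) and the new pass manager
/// (AMDGPUResourceUsageAnalysis); both defer to
/// AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage().
///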
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysisWrapperPass::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID =
    AMDGPUResourceUsageAnalysisWrapperPass::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
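// Note: both assumptions above are (hidden) cl::opt flags, so they can still be
// overridden on the llc command line when experimenting, e.g.
//   llc -mtriple=amdgcn-amd-amdhsa -amdgpu-assume-external-call-stack-size=8192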

INITIALIZE_PASS(AMDGPUResourceUsageAnalysisWrapperPass, DEBUG_TYPE,
                "Function register usage analysis", true, true)

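// Return the function called by a call pseudo, looking through pointer casts
// and aliases. An immediate callee operand (always 0) denotes an unknown
// callee and yields nullptr; callers treat that as an indirect call.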
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

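// Return true if \p Reg has any use that is not an implicit operand of a FLAT
// instruction.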
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysisWrapperPass::runOnMachineFunction(
    MachineFunction &MF) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later (and for AMDPAL), track only the
  // minimum scratch size unless the assumptions were set explicitly on the
  // command line.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);

  return false;
}

AnalysisKey AMDGPUResourceUsageAnalysis::Key;

AMDGPUResourceUsageAnalysis::Result
AMDGPUResourceUsageAnalysis::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later (and for AMDPAL), track only the
  // minimum scratch size unless the assumptions were set explicitly on the
  // command line.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  return AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);
}

AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }

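  // Otherwise walk every operand of every instruction and track the highest
  // register index touched in each bank (SGPR/VGPR/AGPR); calls are folded in
  // conservatively below.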
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
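        // Registers handled explicitly below either do not count against the
        // SGPR/VGPR budget (EXEC, M0, SGPR_NULL, ...) or are tracked through
        // dedicated flags (VCC, FLAT_SCR), so filter them out before the
        // generic register class lookup.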
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
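        // Convert the operand into the highest hardware register index it
        // touches: e.g. an operand in v[4:7] (VReg_128, Width = 4, HW index 4)
        // gives MaxUsed = 7.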
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

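      // Calls: remember direct callees for later propagation and fall back to
      // conservative worst-case assumptions (assumed stack size, VCC, flat
      // scratch) for anything we cannot see, i.e. external declarations,
      // indirect calls and possible recursion.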
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the calling conventions did
        // match, the call would already have been rejected earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}