//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysisWrapperPass::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID =
    AMDGPUResourceUsageAnalysisWrapperPass::ID;

// In code object v4 and older, we need to report some amount of stack use to
// the runtime ahead of time when we don't know the true stack size. Assume a
// smaller number if this is only due to dynamic / non-entry-block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
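
// Both knobs can be overridden on the command line, for example
// -amdgpu-assume-external-call-stack-size=32768 or
// -amdgpu-assume-dynamic-stack-object-size=8192; these particular values are
// purely illustrative, not recommendations.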

INITIALIZE_PASS(AMDGPUResourceUsageAnalysisWrapperPass, DEBUG_TYPE,
                "Function register usage analysis", true, true)

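// Return the callee of a call pseudo. The callee operand is either a global
// (possibly behind pointer casts or aliases) or an immediate, where an
// immediate (asserted to be 0) denotes an unknown callee.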
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

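// Return true if \p Reg has any use that is not an implicit operand of a FLAT
// instruction. Such a use (e.g. from inline assembly) means the register is
// genuinely needed, rather than merely implied by FLAT instructions.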
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

bool AMDGPUResourceUsageAnalysisWrapperPass::runOnMachineFunction(
    MachineFunction &MF) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later (and for AMDPAL), track only the
  // minimum scratch size; values passed explicitly via the flags above still
  // take effect.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);

  return false;
}

AnalysisKey AMDGPUResourceUsageAnalysis::Key;
AMDGPUResourceUsageAnalysis::Result
AMDGPUResourceUsageAnalysis::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later (and for AMDPAL), track only the
  // minimum scratch size; values passed explicitly via the flags above still
  // take effect.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  return AMDGPUResourceUsageAnalysisImpl().analyzeResourceUsage(
      MF, AssumedStackSizeForDynamicSizeObjects,
      AssumedStackSizeForExternalCall);
}
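
// A minimal usage sketch (assumed consumer code, not part of this file): a
// machine-function pass running under the new pass manager could query the
// result as
//   const auto &Info = MFAM.getResult<AMDGPUResourceUsageAnalysis>(MF);
//   if (Info.HasIndirectCall) { /* assume worst-case register usage */ }
// provided the analysis is registered with that pass manager.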

AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

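  // If the stack had to be realigned, account for worst-case alignment padding
  // of up to MaxAlign extra bytes in the reported private segment size.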
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

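        // Classify the register by bank (SGPR / VGPR / AGPR) and record how
        // many 32-bit registers it spans. For example, a use of the 64-bit
        // pair v[4:5] is a VGPR with Width = 2, so it raises MaxVGPR to at
        // least 5 below.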
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
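        // The highest hardware register index touched by this operand is its
        // base index plus the number of 32-bit registers it spans, minus one.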
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

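        // Declarations have no visible body, so they are treated like indirect
        // calls below: worst-case register and stack usage must be assumed.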
        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

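  // Convert the highest register index seen into a register count; if no
  // register of a given kind was encountered, its Max value is still -1 and
  // the resulting count is 0.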
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}