1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8
9 #include "SIFrameLowering.h"
10 #include "AMDGPU.h"
11 #include "GCNSubtarget.h"
12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13 #include "SIMachineFunctionInfo.h"
14 #include "llvm/CodeGen/LiveRegUnits.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/RegisterScavenging.h"
17 #include "llvm/Target/TargetMachine.h"
18
19 using namespace llvm;
20
21 #define DEBUG_TYPE "frame-info"
22
23 static cl::opt<bool> EnableSpillVGPRToAGPR(
24 "amdgpu-spill-vgpr-to-agpr",
25 cl::desc("Enable spilling VGPRs to AGPRs"),
26 cl::ReallyHidden,
27 cl::init(true));
28
29 // Find a register matching \p RC from \p LiveUnits which is unused and
30 // available throughout the function. On failure, returns AMDGPU::NoRegister.
31 // TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32 // MCRegisters. This should reduce the number of iterations and avoid redundant
33 // checking.
findUnusedRegister(MachineRegisterInfo & MRI,const LiveRegUnits & LiveUnits,const TargetRegisterClass & RC)34 static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35 const LiveRegUnits &LiveUnits,
36 const TargetRegisterClass &RC) {
37 for (MCRegister Reg : RC) {
38 if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
39 !MRI.isReserved(Reg))
40 return Reg;
41 }
42 return MCRegister();
43 }
44
45 // Find a scratch register that we can use in the prologue. We avoid using
46 // callee-save registers since they may appear to be free when this is called
47 // from canUseAsPrologue (during shrink wrapping), but then no longer be free
48 // when this is called from emitPrologue.
findScratchNonCalleeSaveRegister(MachineRegisterInfo & MRI,LiveRegUnits & LiveUnits,const TargetRegisterClass & RC,bool Unused=false)49 static MCRegister findScratchNonCalleeSaveRegister(
50 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51 const TargetRegisterClass &RC, bool Unused = false) {
52 // Mark callee saved registers as used so we will not choose them.
53 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54 for (unsigned i = 0; CSRegs[i]; ++i)
55 LiveUnits.addReg(CSRegs[i]);
56
57 // We are looking for a register that can be used throughout the entire
58 // function, so any use is unacceptable.
59 if (Unused)
60 return findUnusedRegister(MRI, LiveUnits, RC);
61
62 for (MCRegister Reg : RC) {
63 if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
64 return Reg;
65 }
66
67 return MCRegister();
68 }
69
70 /// Query target location for spilling SGPRs
71 /// \p IncludeScratchCopy : Also look for free scratch SGPRs
getVGPRSpillLaneOrTempRegister(MachineFunction & MF,LiveRegUnits & LiveUnits,Register SGPR,const TargetRegisterClass & RC=AMDGPU::SReg_32_XM0_XEXECRegClass,bool IncludeScratchCopy=true)72 static void getVGPRSpillLaneOrTempRegister(
73 MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
74 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
75 bool IncludeScratchCopy = true) {
76 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
77 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
78
79 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80 const SIRegisterInfo *TRI = ST.getRegisterInfo();
81 unsigned Size = TRI->getSpillSize(RC);
82 Align Alignment = TRI->getSpillAlign(RC);
83
84 // We need to save and restore the given SGPR.
85
86 Register ScratchSGPR;
87 // 1: Try to save the given register into an unused scratch SGPR. The
88 // LiveUnits should have all the callee saved registers marked as used. For
89 // certain cases we skip copy to scratch SGPR.
90 if (IncludeScratchCopy)
91 ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
92
93 if (!ScratchSGPR) {
94 int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
95 TargetStackID::SGPRSpill);
96
97 if (TRI->spillSGPRToVGPR() &&
98 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
99 /*IsPrologEpilog=*/true)) {
100 // 2: There's no free lane to spill, and no free register to save the
101 // SGPR, so we're forced to take another VGPR to use for the spill.
102 MFI->addToPrologEpilogSGPRSpills(
103 SGPR, PrologEpilogSGPRSaveRestoreInfo(
104 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
105
106 LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
107 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
108 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
109 << '\n';);
110 } else {
111 // Remove dead <FI> index
112 MF.getFrameInfo().RemoveStackObject(FI);
113 // 3: If all else fails, spill the register to memory.
114 FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
115 MFI->addToPrologEpilogSGPRSpills(
116 SGPR,
117 PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
118 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
119 << printReg(SGPR, TRI) << '\n');
120 }
121 } else {
122 MFI->addToPrologEpilogSGPRSpills(
123 SGPR, PrologEpilogSGPRSaveRestoreInfo(
124 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
125 LiveUnits.addReg(ScratchSGPR);
126 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
127 << printReg(ScratchSGPR, TRI) << '\n');
128 }
129 }
130
131 // We need to specially emit stack operations here because a different frame
132 // register is used than in the rest of the function, as getFrameRegister would
133 // use.
buildPrologSpill(const GCNSubtarget & ST,const SIRegisterInfo & TRI,const SIMachineFunctionInfo & FuncInfo,LiveRegUnits & LiveUnits,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register SpillReg,int FI,Register FrameReg,int64_t DwordOff=0)134 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
135 const SIMachineFunctionInfo &FuncInfo,
136 LiveRegUnits &LiveUnits, MachineFunction &MF,
137 MachineBasicBlock &MBB,
138 MachineBasicBlock::iterator I, const DebugLoc &DL,
139 Register SpillReg, int FI, Register FrameReg,
140 int64_t DwordOff = 0) {
141 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
142 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
143
144 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
145 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
146 MachineMemOperand *MMO = MF.getMachineMemOperand(
147 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
148 FrameInfo.getObjectAlign(FI));
149 LiveUnits.addReg(SpillReg);
150 bool IsKill = !MBB.isLiveIn(SpillReg);
151 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
152 DwordOff, MMO, nullptr, &LiveUnits);
153 if (IsKill)
154 LiveUnits.removeReg(SpillReg);
155 }
156
buildEpilogRestore(const GCNSubtarget & ST,const SIRegisterInfo & TRI,const SIMachineFunctionInfo & FuncInfo,LiveRegUnits & LiveUnits,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register SpillReg,int FI,Register FrameReg,int64_t DwordOff=0)157 static void buildEpilogRestore(const GCNSubtarget &ST,
158 const SIRegisterInfo &TRI,
159 const SIMachineFunctionInfo &FuncInfo,
160 LiveRegUnits &LiveUnits, MachineFunction &MF,
161 MachineBasicBlock &MBB,
162 MachineBasicBlock::iterator I,
163 const DebugLoc &DL, Register SpillReg, int FI,
164 Register FrameReg, int64_t DwordOff = 0) {
165 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
166 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
167
168 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
169 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
170 MachineMemOperand *MMO = MF.getMachineMemOperand(
171 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
172 FrameInfo.getObjectAlign(FI));
173 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
174 DwordOff, MMO, nullptr, &LiveUnits);
175 }
176
buildGitPtr(MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,const SIInstrInfo * TII,Register TargetReg)177 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
178 const DebugLoc &DL, const SIInstrInfo *TII,
179 Register TargetReg) {
180 MachineFunction *MF = MBB.getParent();
181 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
182 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
183 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
184 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
185 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
186
187 if (MFI->getGITPtrHigh() != 0xffffffff) {
188 BuildMI(MBB, I, DL, SMovB32, TargetHi)
189 .addImm(MFI->getGITPtrHigh())
190 .addReg(TargetReg, RegState::ImplicitDefine);
191 } else {
192 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
193 BuildMI(MBB, I, DL, GetPC64, TargetReg);
194 }
195 Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
196 MF->getRegInfo().addLiveIn(GitPtrLo);
197 MBB.addLiveIn(GitPtrLo);
198 BuildMI(MBB, I, DL, SMovB32, TargetLo)
199 .addReg(GitPtrLo);
200 }
201
initLiveUnits(LiveRegUnits & LiveUnits,const SIRegisterInfo & TRI,const SIMachineFunctionInfo * FuncInfo,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI,bool IsProlog)202 static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
203 const SIMachineFunctionInfo *FuncInfo,
204 MachineFunction &MF, MachineBasicBlock &MBB,
205 MachineBasicBlock::iterator MBBI, bool IsProlog) {
206 if (LiveUnits.empty()) {
207 LiveUnits.init(TRI);
208 if (IsProlog) {
209 LiveUnits.addLiveIns(MBB);
210 } else {
211 // In epilog.
212 LiveUnits.addLiveOuts(MBB);
213 LiveUnits.stepBackward(*MBBI);
214 }
215 }
216 }
217
218 namespace llvm {
219
220 // SpillBuilder to save/restore special SGPR spills like the one needed for FP,
221 // BP, etc. These spills are delayed until the current function's frame is
222 // finalized. For a given register, the builder uses the
223 // PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
224 class PrologEpilogSGPRSpillBuilder {
225 MachineBasicBlock::iterator MI;
226 MachineBasicBlock &MBB;
227 MachineFunction &MF;
228 const GCNSubtarget &ST;
229 MachineFrameInfo &MFI;
230 SIMachineFunctionInfo *FuncInfo;
231 const SIInstrInfo *TII;
232 const SIRegisterInfo &TRI;
233 Register SuperReg;
234 const PrologEpilogSGPRSaveRestoreInfo SI;
235 LiveRegUnits &LiveUnits;
236 const DebugLoc &DL;
237 Register FrameReg;
238 ArrayRef<int16_t> SplitParts;
239 unsigned NumSubRegs;
240 unsigned EltSize = 4;
241
saveToMemory(const int FI) const242 void saveToMemory(const int FI) const {
243 MachineRegisterInfo &MRI = MF.getRegInfo();
244 assert(!MFI.isDeadObjectIndex(FI));
245
246 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
247
248 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
249 MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
250 if (!TmpVGPR)
251 report_fatal_error("failed to find free scratch register");
252
253 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
254 Register SubReg = NumSubRegs == 1
255 ? SuperReg
256 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
257 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
258 .addReg(SubReg);
259
260 buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
261 FI, FrameReg, DwordOff);
262 DwordOff += 4;
263 }
264 }
265
saveToVGPRLane(const int FI) const266 void saveToVGPRLane(const int FI) const {
267 assert(!MFI.isDeadObjectIndex(FI));
268
269 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
270 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
271 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
272 assert(Spill.size() == NumSubRegs);
273
274 for (unsigned I = 0; I < NumSubRegs; ++I) {
275 Register SubReg = NumSubRegs == 1
276 ? SuperReg
277 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
278 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
279 Spill[I].VGPR)
280 .addReg(SubReg)
281 .addImm(Spill[I].Lane)
282 .addReg(Spill[I].VGPR, RegState::Undef);
283 }
284 }
285
copyToScratchSGPR(Register DstReg) const286 void copyToScratchSGPR(Register DstReg) const {
287 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
288 .addReg(SuperReg)
289 .setMIFlag(MachineInstr::FrameSetup);
290 }
291
restoreFromMemory(const int FI)292 void restoreFromMemory(const int FI) {
293 MachineRegisterInfo &MRI = MF.getRegInfo();
294
295 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
296 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
297 MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
298 if (!TmpVGPR)
299 report_fatal_error("failed to find free scratch register");
300
301 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
302 Register SubReg = NumSubRegs == 1
303 ? SuperReg
304 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
305
306 buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
307 TmpVGPR, FI, FrameReg, DwordOff);
308 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
309 .addReg(TmpVGPR, RegState::Kill);
310 DwordOff += 4;
311 }
312 }
313
restoreFromVGPRLane(const int FI)314 void restoreFromVGPRLane(const int FI) {
315 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
316 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
317 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
318 assert(Spill.size() == NumSubRegs);
319
320 for (unsigned I = 0; I < NumSubRegs; ++I) {
321 Register SubReg = NumSubRegs == 1
322 ? SuperReg
323 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
324 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
325 .addReg(Spill[I].VGPR)
326 .addImm(Spill[I].Lane);
327 }
328 }
329
copyFromScratchSGPR(Register SrcReg) const330 void copyFromScratchSGPR(Register SrcReg) const {
331 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
332 .addReg(SrcReg)
333 .setMIFlag(MachineInstr::FrameDestroy);
334 }
335
336 public:
PrologEpilogSGPRSpillBuilder(Register Reg,const PrologEpilogSGPRSaveRestoreInfo SI,MachineBasicBlock & MBB,MachineBasicBlock::iterator MI,const DebugLoc & DL,const SIInstrInfo * TII,const SIRegisterInfo & TRI,LiveRegUnits & LiveUnits,Register FrameReg)337 PrologEpilogSGPRSpillBuilder(Register Reg,
338 const PrologEpilogSGPRSaveRestoreInfo SI,
339 MachineBasicBlock &MBB,
340 MachineBasicBlock::iterator MI,
341 const DebugLoc &DL, const SIInstrInfo *TII,
342 const SIRegisterInfo &TRI,
343 LiveRegUnits &LiveUnits, Register FrameReg)
344 : MI(MI), MBB(MBB), MF(*MBB.getParent()),
345 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
346 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
347 SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
348 FrameReg(FrameReg) {
349 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
350 SplitParts = TRI.getRegSplitParts(RC, EltSize);
351 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
352
353 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
354 }
355
save()356 void save() {
357 switch (SI.getKind()) {
358 case SGPRSaveKind::SPILL_TO_MEM:
359 return saveToMemory(SI.getIndex());
360 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
361 return saveToVGPRLane(SI.getIndex());
362 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
363 return copyToScratchSGPR(SI.getReg());
364 }
365 }
366
restore()367 void restore() {
368 switch (SI.getKind()) {
369 case SGPRSaveKind::SPILL_TO_MEM:
370 return restoreFromMemory(SI.getIndex());
371 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
372 return restoreFromVGPRLane(SI.getIndex());
373 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
374 return copyFromScratchSGPR(SI.getReg());
375 }
376 }
377 };
378
379 } // namespace llvm
380
381 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
emitEntryFunctionFlatScratchInit(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register ScratchWaveOffsetReg) const382 void SIFrameLowering::emitEntryFunctionFlatScratchInit(
383 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
384 const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
385 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
386 const SIInstrInfo *TII = ST.getInstrInfo();
387 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
388 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
389
390 // We don't need this if we only have spills since there is no user facing
391 // scratch.
392
393 // TODO: If we know we don't have flat instructions earlier, we can omit
394 // this from the input registers.
395 //
396 // TODO: We only need to know if we access scratch space through a flat
397 // pointer. Because we only detect if flat instructions are used at all,
398 // this will be used more often than necessary on VI.
399
400 Register FlatScrInitLo;
401 Register FlatScrInitHi;
402
403 if (ST.isAmdPalOS()) {
404 // Extract the scratch offset from the descriptor in the GIT
405 LiveRegUnits LiveUnits;
406 LiveUnits.init(*TRI);
407 LiveUnits.addLiveIns(MBB);
408
409 // Find unused reg to load flat scratch init into
410 MachineRegisterInfo &MRI = MF.getRegInfo();
411 Register FlatScrInit = AMDGPU::NoRegister;
412 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
413 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
414 AllSGPR64s = AllSGPR64s.slice(
415 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
416 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
417 for (MCPhysReg Reg : AllSGPR64s) {
418 if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
419 MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
420 FlatScrInit = Reg;
421 break;
422 }
423 }
424 assert(FlatScrInit && "Failed to find free register for scratch init");
425
426 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
427 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
428
429 buildGitPtr(MBB, I, DL, TII, FlatScrInit);
430
431 // We now have the GIT ptr - now get the scratch descriptor from the entry
432 // at offset 0 (or offset 16 for a compute shader).
433 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
434 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
435 auto *MMO = MF.getMachineMemOperand(
436 PtrInfo,
437 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
438 MachineMemOperand::MODereferenceable,
439 8, Align(4));
440 unsigned Offset =
441 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
442 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
443 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
444 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
445 .addReg(FlatScrInit)
446 .addImm(EncodedOffset) // offset
447 .addImm(0) // cpol
448 .addMemOperand(MMO);
449
450 // Mask the offset in [47:0] of the descriptor
451 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
452 auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
453 .addReg(FlatScrInitHi)
454 .addImm(0xffff);
455 And->getOperand(3).setIsDead(); // Mark SCC as dead.
456 } else {
457 Register FlatScratchInitReg =
458 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
459 assert(FlatScratchInitReg);
460
461 MachineRegisterInfo &MRI = MF.getRegInfo();
462 MRI.addLiveIn(FlatScratchInitReg);
463 MBB.addLiveIn(FlatScratchInitReg);
464
465 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
466 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
467 }
468
469 // Do a 64-bit pointer add.
470 if (ST.flatScratchIsPointer()) {
471 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
472 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
473 .addReg(FlatScrInitLo)
474 .addReg(ScratchWaveOffsetReg);
475 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
476 FlatScrInitHi)
477 .addReg(FlatScrInitHi)
478 .addImm(0);
479 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
480
481 using namespace AMDGPU::Hwreg;
482 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
483 .addReg(FlatScrInitLo)
484 .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
485 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
486 .addReg(FlatScrInitHi)
487 .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
488 return;
489 }
490
491 // For GFX9.
492 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
493 .addReg(FlatScrInitLo)
494 .addReg(ScratchWaveOffsetReg);
495 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
496 AMDGPU::FLAT_SCR_HI)
497 .addReg(FlatScrInitHi)
498 .addImm(0);
499 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
500
501 return;
502 }
503
504 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
505
506 // Copy the size in bytes.
507 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
508 .addReg(FlatScrInitHi, RegState::Kill);
509
510 // Add wave offset in bytes to private base offset.
511 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
512 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
513 .addReg(FlatScrInitLo)
514 .addReg(ScratchWaveOffsetReg);
515
516 // Convert offset to 256-byte units.
517 auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
518 AMDGPU::FLAT_SCR_HI)
519 .addReg(FlatScrInitLo, RegState::Kill)
520 .addImm(8);
521 LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
522 }
523
524 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
525 // memory. They should have been removed by now.
allStackObjectsAreDead(const MachineFrameInfo & MFI)526 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
527 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
528 I != E; ++I) {
529 if (!MFI.isDeadObjectIndex(I))
530 return false;
531 }
532
533 return true;
534 }
535
536 // Shift down registers reserved for the scratch RSRC.
getEntryFunctionReservedScratchRsrcReg(MachineFunction & MF) const537 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
538 MachineFunction &MF) const {
539
540 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
541 const SIInstrInfo *TII = ST.getInstrInfo();
542 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
543 MachineRegisterInfo &MRI = MF.getRegInfo();
544 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
545
546 assert(MFI->isEntryFunction());
547
548 Register ScratchRsrcReg = MFI->getScratchRSrcReg();
549
550 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
551 allStackObjectsAreDead(MF.getFrameInfo())))
552 return Register();
553
554 if (ST.hasSGPRInitBug() ||
555 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
556 return ScratchRsrcReg;
557
558 // We reserved the last registers for this. Shift it down to the end of those
559 // which were actually used.
560 //
561 // FIXME: It might be safer to use a pseudoregister before replacement.
562
563 // FIXME: We should be able to eliminate unused input registers. We only
564 // cannot do this for the resources required for scratch access. For now we
565 // skip over user SGPRs and may leave unused holes.
566
567 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
568 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
569 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
570
571 // Skip the last N reserved elements because they should have already been
572 // reserved for VCC etc.
573 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
574 for (MCPhysReg Reg : AllSGPR128s) {
575 // Pick the first unallocated one. Make sure we don't clobber the other
576 // reserved input we needed. Also for PAL, make sure we don't clobber
577 // the GIT pointer passed in SGPR0 or SGPR8.
578 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
579 (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
580 MRI.replaceRegWith(ScratchRsrcReg, Reg);
581 MFI->setScratchRSrcReg(Reg);
582 MRI.reserveReg(Reg, TRI);
583 return Reg;
584 }
585 }
586
587 return ScratchRsrcReg;
588 }
589
getScratchScaleFactor(const GCNSubtarget & ST)590 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
591 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
592 }
593
emitEntryFunctionPrologue(MachineFunction & MF,MachineBasicBlock & MBB) const594 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
595 MachineBasicBlock &MBB) const {
596 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
597
598 // FIXME: If we only have SGPR spills, we won't actually be using scratch
599 // memory since these spill to VGPRs. We should be cleaning up these unused
600 // SGPR spill frame indices somewhere.
601
602 // FIXME: We still have implicit uses on SGPR spill instructions in case they
603 // need to spill to vector memory. It's likely that will not happen, but at
604 // this point it appears we need the setup. This part of the prolog should be
605 // emitted after frame indices are eliminated.
606
607 // FIXME: Remove all of the isPhysRegUsed checks
608
609 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
610 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
611 const SIInstrInfo *TII = ST.getInstrInfo();
612 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
613 MachineRegisterInfo &MRI = MF.getRegInfo();
614 const Function &F = MF.getFunction();
615 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
616
617 assert(MFI->isEntryFunction());
618
619 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
620 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
621
622 // We need to do the replacement of the private segment buffer register even
623 // if there are no stack objects. There could be stores to undef or a
624 // constant without an associated object.
625 //
626 // This will return `Register()` in cases where there are no actual
627 // uses of the SRSRC.
628 Register ScratchRsrcReg;
629 if (!ST.enableFlatScratch())
630 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
631
632 // Make the selected register live throughout the function.
633 if (ScratchRsrcReg) {
634 for (MachineBasicBlock &OtherBB : MF) {
635 if (&OtherBB != &MBB) {
636 OtherBB.addLiveIn(ScratchRsrcReg);
637 }
638 }
639 }
640
641 // Now that we have fixed the reserved SRSRC we need to locate the
642 // (potentially) preloaded SRSRC.
643 Register PreloadedScratchRsrcReg;
644 if (ST.isAmdHsaOrMesa(F)) {
645 PreloadedScratchRsrcReg =
646 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
647 if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
648 // We added live-ins during argument lowering, but since they were not
649 // used they were deleted. We're adding the uses now, so add them back.
650 MRI.addLiveIn(PreloadedScratchRsrcReg);
651 MBB.addLiveIn(PreloadedScratchRsrcReg);
652 }
653 }
654
655 // Debug location must be unknown since the first debug location is used to
656 // determine the end of the prologue.
657 DebugLoc DL;
658 MachineBasicBlock::iterator I = MBB.begin();
659
660 // We found the SRSRC first because it needs four registers and has an
661 // alignment requirement. If the SRSRC that we found is clobbering with
662 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
663 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
664 // wave offset to a free SGPR.
665 Register ScratchWaveOffsetReg;
666 if (PreloadedScratchWaveOffsetReg &&
667 TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
668 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
669 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
670 AllSGPRs = AllSGPRs.slice(
671 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
672 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
673 for (MCPhysReg Reg : AllSGPRs) {
674 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
675 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
676 ScratchWaveOffsetReg = Reg;
677 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
678 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
679 break;
680 }
681 }
682
683 // FIXME: We can spill incoming arguments and restore at the end of the
684 // prolog.
685 if (!ScratchWaveOffsetReg)
686 report_fatal_error(
687 "could not find temporary scratch offset register in prolog");
688 } else {
689 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
690 }
691 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
692
693 if (hasFP(MF)) {
694 Register FPReg = MFI->getFrameOffsetReg();
695 assert(FPReg != AMDGPU::FP_REG);
696 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
697 }
698
699 if (requiresStackPointerReference(MF)) {
700 Register SPReg = MFI->getStackPtrOffsetReg();
701 assert(SPReg != AMDGPU::SP_REG);
702 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
703 .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
704 }
705
706 bool NeedsFlatScratchInit =
707 MFI->getUserSGPRInfo().hasFlatScratchInit() &&
708 (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
709 (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
710
711 if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
712 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
713 MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
714 MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
715 }
716
717 if (NeedsFlatScratchInit) {
718 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
719 }
720
721 if (ScratchRsrcReg) {
722 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
723 PreloadedScratchRsrcReg,
724 ScratchRsrcReg, ScratchWaveOffsetReg);
725 }
726 }
727
728 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
emitEntryFunctionScratchRsrcRegSetup(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register PreloadedScratchRsrcReg,Register ScratchRsrcReg,Register ScratchWaveOffsetReg) const729 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
730 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
731 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
732 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
733
734 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
735 const SIInstrInfo *TII = ST.getInstrInfo();
736 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
737 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
738 const Function &Fn = MF.getFunction();
739
740 if (ST.isAmdPalOS()) {
741 // The pointer to the GIT is formed from the offset passed in and either
742 // the amdgpu-git-ptr-high function attribute or the top part of the PC
743 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
744 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
745
746 buildGitPtr(MBB, I, DL, TII, Rsrc01);
747
748 // We now have the GIT ptr - now get the scratch descriptor from the entry
749 // at offset 0 (or offset 16 for a compute shader).
750 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
751 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
752 auto MMO = MF.getMachineMemOperand(PtrInfo,
753 MachineMemOperand::MOLoad |
754 MachineMemOperand::MOInvariant |
755 MachineMemOperand::MODereferenceable,
756 16, Align(4));
757 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
758 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
759 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
760 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
761 .addReg(Rsrc01)
762 .addImm(EncodedOffset) // offset
763 .addImm(0) // cpol
764 .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
765 .addMemOperand(MMO);
766
767 // The driver will always set the SRD for wave 64 (bits 118:117 of
768 // descriptor / bits 22:21 of third sub-reg will be 0b11)
769 // If the shader is actually wave32 we have to modify the const_index_stride
770 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
771 // reason the driver does this is that there can be cases where it presents
772 // 2 shaders with different wave size (e.g. VsFs).
773 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
774 if (ST.isWave32()) {
775 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
776 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
777 .addImm(21)
778 .addReg(Rsrc03);
779 }
780 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
781 assert(!ST.isAmdHsaOrMesa(Fn));
782 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
783
784 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
785 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
786
787 // Use relocations to get the pointer, and setup the other bits manually.
788 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
789
790 if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
791 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
792
793 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
794 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
795
796 BuildMI(MBB, I, DL, Mov64, Rsrc01)
797 .addReg(MFI->getImplicitBufferPtrUserSGPR())
798 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
799 } else {
800 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
801
802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
803 auto MMO = MF.getMachineMemOperand(
804 PtrInfo,
805 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
806 MachineMemOperand::MODereferenceable,
807 8, Align(4));
808 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
809 .addReg(MFI->getImplicitBufferPtrUserSGPR())
810 .addImm(0) // offset
811 .addImm(0) // cpol
812 .addMemOperand(MMO)
813 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
814
815 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
816 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
817 }
818 } else {
819 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
820 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
821
822 BuildMI(MBB, I, DL, SMovB32, Rsrc0)
823 .addExternalSymbol("SCRATCH_RSRC_DWORD0")
824 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
825
826 BuildMI(MBB, I, DL, SMovB32, Rsrc1)
827 .addExternalSymbol("SCRATCH_RSRC_DWORD1")
828 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
829 }
830
831 BuildMI(MBB, I, DL, SMovB32, Rsrc2)
832 .addImm(Rsrc23 & 0xffffffff)
833 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
834
835 BuildMI(MBB, I, DL, SMovB32, Rsrc3)
836 .addImm(Rsrc23 >> 32)
837 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
838 } else if (ST.isAmdHsaOrMesa(Fn)) {
839 assert(PreloadedScratchRsrcReg);
840
841 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
842 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
843 .addReg(PreloadedScratchRsrcReg, RegState::Kill);
844 }
845 }
846
847 // Add the scratch wave offset into the scratch RSRC.
848 //
849 // We only want to update the first 48 bits, which is the base address
850 // pointer, without touching the adjacent 16 bits of flags. We know this add
851 // cannot carry-out from bit 47, otherwise the scratch allocation would be
852 // impossible to fit in the 48-bit global address space.
853 //
854 // TODO: Evaluate if it is better to just construct an SRD using the flat
855 // scratch init and some constants rather than update the one we are passed.
856 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
857 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
858
859 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
860 // the kernel body via inreg arguments.
861 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
862 .addReg(ScratchRsrcSub0)
863 .addReg(ScratchWaveOffsetReg)
864 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
865 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
866 .addReg(ScratchRsrcSub1)
867 .addImm(0)
868 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
869 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
870 }
871
isSupportedStackID(TargetStackID::Value ID) const872 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
873 switch (ID) {
874 case TargetStackID::Default:
875 case TargetStackID::NoAlloc:
876 case TargetStackID::SGPRSpill:
877 return true;
878 case TargetStackID::ScalableVector:
879 case TargetStackID::WasmLocal:
880 return false;
881 }
882 llvm_unreachable("Invalid TargetStackID::Value");
883 }
884
885 // Activate only the inactive lanes when \p EnableInactiveLanes is true.
886 // Otherwise, activate all lanes. It returns the saved exec.
buildScratchExecCopy(LiveRegUnits & LiveUnits,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI,const DebugLoc & DL,bool IsProlog,bool EnableInactiveLanes)887 static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
888 MachineFunction &MF,
889 MachineBasicBlock &MBB,
890 MachineBasicBlock::iterator MBBI,
891 const DebugLoc &DL, bool IsProlog,
892 bool EnableInactiveLanes) {
893 Register ScratchExecCopy;
894 MachineRegisterInfo &MRI = MF.getRegInfo();
895 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
896 const SIInstrInfo *TII = ST.getInstrInfo();
897 const SIRegisterInfo &TRI = TII->getRegisterInfo();
898 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
899
900 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
901
902 ScratchExecCopy = findScratchNonCalleeSaveRegister(
903 MRI, LiveUnits, *TRI.getWaveMaskRegClass());
904 if (!ScratchExecCopy)
905 report_fatal_error("failed to find free scratch register");
906
907 LiveUnits.addReg(ScratchExecCopy);
908
909 const unsigned SaveExecOpc =
910 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
911 : AMDGPU::S_OR_SAVEEXEC_B32)
912 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
913 : AMDGPU::S_OR_SAVEEXEC_B64);
914 auto SaveExec =
915 BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
916 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
917
918 return ScratchExecCopy;
919 }
920
emitCSRSpillStores(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI,DebugLoc & DL,LiveRegUnits & LiveUnits,Register FrameReg,Register FramePtrRegScratchCopy) const921 void SIFrameLowering::emitCSRSpillStores(
922 MachineFunction &MF, MachineBasicBlock &MBB,
923 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
924 Register FrameReg, Register FramePtrRegScratchCopy) const {
925 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
926 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
927 const SIInstrInfo *TII = ST.getInstrInfo();
928 const SIRegisterInfo &TRI = TII->getRegisterInfo();
929
930 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
931 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
932 // might end up flipping the EXEC bits twice.
933 Register ScratchExecCopy;
934 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
935 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
936 if (!WWMScratchRegs.empty())
937 ScratchExecCopy =
938 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
939 /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
940
941 auto StoreWWMRegisters =
942 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
943 for (const auto &Reg : WWMRegs) {
944 Register VGPR = Reg.first;
945 int FI = Reg.second;
946 buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
947 VGPR, FI, FrameReg);
948 }
949 };
950
951 StoreWWMRegisters(WWMScratchRegs);
952 if (!WWMCalleeSavedRegs.empty()) {
953 if (ScratchExecCopy) {
954 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
955 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
956 } else {
957 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
958 /*IsProlog*/ true,
959 /*EnableInactiveLanes*/ false);
960 }
961 }
962
963 StoreWWMRegisters(WWMCalleeSavedRegs);
964 if (ScratchExecCopy) {
965 // FIXME: Split block and make terminator.
966 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
967 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
968 .addReg(ScratchExecCopy, RegState::Kill);
969 LiveUnits.addReg(ScratchExecCopy);
970 }
971
972 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
973
974 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
975 // Special handle FP spill:
976 // Skip if FP is saved to a scratch SGPR, the save has already been emitted.
977 // Otherwise, FP has been moved to a temporary register and spill it
978 // instead.
979 Register Reg =
980 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
981 if (!Reg)
982 continue;
983
984 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
985 LiveUnits, FrameReg);
986 SB.save();
987 }
988
989 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
990 // such scratch registers live throughout the function.
991 SmallVector<Register, 1> ScratchSGPRs;
992 FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
993 if (!ScratchSGPRs.empty()) {
994 for (MachineBasicBlock &MBB : MF) {
995 for (MCPhysReg Reg : ScratchSGPRs)
996 MBB.addLiveIn(Reg);
997
998 MBB.sortUniqueLiveIns();
999 }
1000 if (!LiveUnits.empty()) {
1001 for (MCPhysReg Reg : ScratchSGPRs)
1002 LiveUnits.addReg(Reg);
1003 }
1004 }
1005 }
1006
emitCSRSpillRestores(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI,DebugLoc & DL,LiveRegUnits & LiveUnits,Register FrameReg,Register FramePtrRegScratchCopy) const1007 void SIFrameLowering::emitCSRSpillRestores(
1008 MachineFunction &MF, MachineBasicBlock &MBB,
1009 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
1010 Register FrameReg, Register FramePtrRegScratchCopy) const {
1011 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1012 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1013 const SIInstrInfo *TII = ST.getInstrInfo();
1014 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1015 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1016
1017 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1018 // Special handle FP restore:
1019 // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
1020 // the FP value to a temporary register. The frame pointer should be
1021 // overwritten only at the end when all other spills are restored from
1022 // current frame.
1023 Register Reg =
1024 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1025 if (!Reg)
1026 continue;
1027
1028 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1029 LiveUnits, FrameReg);
1030 SB.restore();
1031 }
1032
1033 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1034 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1035 // this, we might end up flipping the EXEC bits twice.
1036 Register ScratchExecCopy;
1037 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1038 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
1039 if (!WWMScratchRegs.empty())
1040 ScratchExecCopy =
1041 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1042 /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
1043
1044 auto RestoreWWMRegisters =
1045 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1046 for (const auto &Reg : WWMRegs) {
1047 Register VGPR = Reg.first;
1048 int FI = Reg.second;
1049 buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
1050 VGPR, FI, FrameReg);
1051 }
1052 };
1053
1054 RestoreWWMRegisters(WWMScratchRegs);
1055 if (!WWMCalleeSavedRegs.empty()) {
1056 if (ScratchExecCopy) {
1057 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1058 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
1059 } else {
1060 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1061 /*IsProlog*/ false,
1062 /*EnableInactiveLanes*/ false);
1063 }
1064 }
1065
1066 RestoreWWMRegisters(WWMCalleeSavedRegs);
1067 if (ScratchExecCopy) {
1068 // FIXME: Split block and make terminator.
1069 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1070 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
1071 .addReg(ScratchExecCopy, RegState::Kill);
1072 }
1073 }
1074
emitPrologue(MachineFunction & MF,MachineBasicBlock & MBB) const1075 void SIFrameLowering::emitPrologue(MachineFunction &MF,
1076 MachineBasicBlock &MBB) const {
1077 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1078 if (FuncInfo->isEntryFunction()) {
1079 emitEntryFunctionPrologue(MF, MBB);
1080 return;
1081 }
1082
1083 MachineFrameInfo &MFI = MF.getFrameInfo();
1084 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1085 const SIInstrInfo *TII = ST.getInstrInfo();
1086 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1087 MachineRegisterInfo &MRI = MF.getRegInfo();
1088
1089 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1090 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1091 Register BasePtrReg =
1092 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1093 LiveRegUnits LiveUnits;
1094
1095 MachineBasicBlock::iterator MBBI = MBB.begin();
1096 // DebugLoc must be unknown since the first instruction with DebugLoc is used
1097 // to determine the end of the prologue.
1098 DebugLoc DL;
1099
1100 if (FuncInfo->isChainFunction()) {
1101 // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1102 // are free to set one up if they need it.
1103 bool UseSP = requiresStackPointerReference(MF);
1104 if (UseSP) {
1105 assert(StackPtrReg != AMDGPU::SP_REG);
1106
1107 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
1108 .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
1109 }
1110 }
1111
1112 bool HasFP = false;
1113 bool HasBP = false;
1114 uint32_t NumBytes = MFI.getStackSize();
1115 uint32_t RoundedSize = NumBytes;
1116
1117 if (TRI.hasStackRealignment(MF))
1118 HasFP = true;
1119
1120 Register FramePtrRegScratchCopy;
1121 if (!HasFP && !hasFP(MF)) {
1122 // Emit the CSR spill stores with SP base register.
1123 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1124 FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1125 FramePtrRegScratchCopy);
1126 } else {
1127 // CSR spill stores will use FP as base register.
1128 Register SGPRForFPSaveRestoreCopy =
1129 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1130
1131 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1132 if (SGPRForFPSaveRestoreCopy) {
1133 // Copy FP to the scratch register now and emit the CFI entry. It avoids
1134 // the extra FP copy needed in the other two cases when FP is spilled to
1135 // memory or to a VGPR lane.
1136 PrologEpilogSGPRSpillBuilder SB(
1137 FramePtrReg,
1138 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
1139 DL, TII, TRI, LiveUnits, FramePtrReg);
1140 SB.save();
1141 LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1142 } else {
1143 // Copy FP into a new scratch register so that its previous value can be
1144 // spilled after setting up the new frame.
1145 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1146 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1147 if (!FramePtrRegScratchCopy)
1148 report_fatal_error("failed to find free scratch register");
1149
1150 LiveUnits.addReg(FramePtrRegScratchCopy);
1151 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
1152 .addReg(FramePtrReg);
1153 }
1154 }
1155
1156 if (HasFP) {
1157 const unsigned Alignment = MFI.getMaxAlign().value();
1158
1159 RoundedSize += Alignment;
1160 if (LiveUnits.empty()) {
1161 LiveUnits.init(TRI);
1162 LiveUnits.addLiveIns(MBB);
1163 }
1164
1165 // s_add_i32 s33, s32, NumBytes
1166 // s_and_b32 s33, s33, 0b111...0000
1167 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
1168 .addReg(StackPtrReg)
1169 .addImm((Alignment - 1) * getScratchScaleFactor(ST))
1170 .setMIFlag(MachineInstr::FrameSetup);
1171 auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
1172 .addReg(FramePtrReg, RegState::Kill)
1173 .addImm(-Alignment * getScratchScaleFactor(ST))
1174 .setMIFlag(MachineInstr::FrameSetup);
1175 And->getOperand(3).setIsDead(); // Mark SCC as dead.
1176 FuncInfo->setIsStackRealigned(true);
1177 } else if ((HasFP = hasFP(MF))) {
1178 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1179 .addReg(StackPtrReg)
1180 .setMIFlag(MachineInstr::FrameSetup);
1181 }
1182
1183 // If FP is used, emit the CSR spills with FP base register.
1184 if (HasFP) {
1185 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1186 FramePtrRegScratchCopy);
1187 if (FramePtrRegScratchCopy)
1188 LiveUnits.removeReg(FramePtrRegScratchCopy);
1189 }
1190
1191 // If we need a base pointer, set it up here. It's whatever the value of
1192 // the stack pointer is at this point. Any variable size objects will be
1193 // allocated after this, so we can still use the base pointer to reference
1194 // the incoming arguments.
1195 if ((HasBP = TRI.hasBasePointer(MF))) {
1196 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1197 .addReg(StackPtrReg)
1198 .setMIFlag(MachineInstr::FrameSetup);
1199 }
1200
1201 if (HasFP && RoundedSize != 0) {
1202 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1203 .addReg(StackPtrReg)
1204 .addImm(RoundedSize * getScratchScaleFactor(ST))
1205 .setMIFlag(MachineInstr::FrameSetup);
1206 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1207 }
1208
1209 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1210 (void)FPSaved;
1211 assert((!HasFP || FPSaved) &&
1212 "Needed to save FP but didn't save it anywhere");
1213
1214 // If we allow spilling to AGPRs we may have saved FP but then spill
1215 // everything into AGPRs instead of the stack.
1216 assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
1217 "Saved FP but didn't need it");
1218
1219 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
1220 (void)BPSaved;
1221 assert((!HasBP || BPSaved) &&
1222 "Needed to save BP but didn't save it anywhere");
1223
1224 assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1225 }
1226
emitEpilogue(MachineFunction & MF,MachineBasicBlock & MBB) const1227 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1228 MachineBasicBlock &MBB) const {
1229 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1230 if (FuncInfo->isEntryFunction())
1231 return;
1232
1233 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1234 const SIInstrInfo *TII = ST.getInstrInfo();
1235 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1236 MachineRegisterInfo &MRI = MF.getRegInfo();
1237 LiveRegUnits LiveUnits;
1238 // Get the insert location for the epilogue. If there were no terminators in
1239 // the block, get the last instruction.
1240 MachineBasicBlock::iterator MBBI = MBB.end();
1241 DebugLoc DL;
1242 if (!MBB.empty()) {
1243 MBBI = MBB.getLastNonDebugInstr();
1244 if (MBBI != MBB.end())
1245 DL = MBBI->getDebugLoc();
1246
1247 MBBI = MBB.getFirstTerminator();
1248 }
1249
1250 const MachineFrameInfo &MFI = MF.getFrameInfo();
1251 uint32_t NumBytes = MFI.getStackSize();
1252 uint32_t RoundedSize = FuncInfo->isStackRealigned()
1253 ? NumBytes + MFI.getMaxAlign().value()
1254 : NumBytes;
1255 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1256 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1257 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1258
1259 Register FramePtrRegScratchCopy;
1260 Register SGPRForFPSaveRestoreCopy =
1261 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1262 if (FPSaved) {
1263 // CSR spill restores should use FP as base register. If
1264 // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
1265 // into a new scratch register and copy to FP later when other registers are
1266 // restored from the current stack frame.
1267 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1268 if (SGPRForFPSaveRestoreCopy) {
1269 LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1270 } else {
1271 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1272 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1273 if (!FramePtrRegScratchCopy)
1274 report_fatal_error("failed to find free scratch register");
1275
1276 LiveUnits.addReg(FramePtrRegScratchCopy);
1277 }
1278
1279 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1280 FramePtrRegScratchCopy);
1281 }
1282
1283 if (RoundedSize != 0 && hasFP(MF)) {
1284 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1285 .addReg(StackPtrReg)
1286 .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1287 .setMIFlag(MachineInstr::FrameDestroy);
1288 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1289 }
1290
1291 if (FPSaved) {
1292 // Insert the copy to restore FP.
1293 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1294 : FramePtrRegScratchCopy;
1295 MachineInstrBuilder MIB =
1296 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1297 .addReg(SrcReg);
1298 if (SGPRForFPSaveRestoreCopy)
1299 MIB.setMIFlag(MachineInstr::FrameDestroy);
1300 } else {
1301 // Insert the CSR spill restores with SP as the base register.
1302 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
1303 FramePtrRegScratchCopy);
1304 }
1305 }
1306
1307 #ifndef NDEBUG
allSGPRSpillsAreDead(const MachineFunction & MF)1308 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1309 const MachineFrameInfo &MFI = MF.getFrameInfo();
1310 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1311 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1312 I != E; ++I) {
1313 if (!MFI.isDeadObjectIndex(I) &&
1314 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1315 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1316 return false;
1317 }
1318 }
1319
1320 return true;
1321 }
1322 #endif
1323
getFrameIndexReference(const MachineFunction & MF,int FI,Register & FrameReg) const1324 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1325 int FI,
1326 Register &FrameReg) const {
1327 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1328
1329 FrameReg = RI->getFrameRegister(MF);
1330 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1331 }
1332
processFunctionBeforeFrameFinalized(MachineFunction & MF,RegScavenger * RS) const1333 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1334 MachineFunction &MF,
1335 RegScavenger *RS) const {
1336 MachineFrameInfo &MFI = MF.getFrameInfo();
1337
1338 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1339 const SIInstrInfo *TII = ST.getInstrInfo();
1340 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1341 MachineRegisterInfo &MRI = MF.getRegInfo();
1342 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1343
1344 // Allocate spill slots for WWM reserved VGPRs.
1345 // For chain functions, we only need to do this if we have calls to
1346 // llvm.amdgcn.cs.chain.
1347 bool IsChainWithoutCalls =
1348 FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1349 if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1350 for (Register Reg : FuncInfo->getWWMReservedRegs()) {
1351 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1352 FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
1353 TRI->getSpillAlign(*RC));
1354 }
1355 }
1356
1357 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1358 && EnableSpillVGPRToAGPR;
1359
  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack
    // slot. If a spill slot handled above is not also used by some other
    // (non-VGPR-spill) stack access, the VGPR-to-AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);
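
    // Registers picked for VGPR<->AGPR spilling are assigned after register
    // allocation, so conservatively mark them live-in to every block.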
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update them with the
        // correct register value. But it is not clear that the register value
        // alone is sufficient to recover the original debug info.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but hasNonSpillStackObjects is currently set only from source allocas.
  // Stack temps produced from legalization are not counted.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot.
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we initially reserved the highest available VGPR for the
    // AGPR copy. Now that RA is done, check whether there exists an unused
    // VGPR that is lower than the one reserved before RA. If one exists, use
    // it for the AGPR copy instead.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve the newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches.
  // Now, after RA, shift down to a lower unused pair if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null, we never found a long branch and never
  // reserved a register to begin with, so there is nothing to shift down.
  // If UnusedLowSGPR is null, there is no lower register available, so just
  // keep the one we originally set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// The special SGPR spills, like those needed for the FP, BP, or any other
// reserved registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
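
  // Whole-wave register spills need a temporary copy of EXEC. Prefer keeping
  // that copy in a free SGPR; only fall back to spilling the reserved SGPR to
  // a VGPR lane when no scratch SGPR is available.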
  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If we found an unused scratch SGPR, use it for the EXEC copy directly;
      // no spill is needed in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. No whole-wave copies or spills were encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  MFI->shiftSpillPhysVGPRsToLowestRange(MF);

  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs, so the callee must spill and restore those VGPRs even
      // if they are marked caller-saved.

      // TODO: Handle this elsewhere at an early point. Walking through all
      // MBBs here would be a bad heuristic. A better way should be by calling
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers.
      if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
      else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
      else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all returns to have the same number of register operands.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restores from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // gfx908 has no direct AGPR loads and stores, so spilling an AGPR would also
  // require a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The whole-wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all lane VGPRs as BB live-ins.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or a spill of the
  // caller-saved VGPR reserved for SGPR spills, since we now always create a
  // stack entry for it even when there are no other stack objects: an FP is
  // required whenever there is a call and a stack. A VGPR is allocated for
  // SGPR spills whenever there are any, whether they are CSR spills or
  // otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // The return-address use in the return instruction is hidden behind the
  // SI_RETURN pseudo. Since IPRA computes actual register usage and does not
  // consult the CSR list, clobbering of the return address by function calls
  // (D117243) or otherwise (D120922) is not visible to IPRA's register usage
  // collection. Explicitly marking it saved here ensures the return address is
  // saved and restored in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;
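
  // When the FP and/or BP are saved to scratch SGPR copies rather than to
  // memory, record those registers as the spill destinations on the
  // corresponding CalleeSavedInfo entries; everything else keeps the default
  // slot assignment (hence returning false below).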
  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    }
  }

  return false;
}

bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
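  // The worst-case offset an emergency slot could end up at is the last byte
  // of the estimated frame.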
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (TII->isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();
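
    // The SP is maintained in scaled scratch units, so convert the byte
    // amount with the target's scratch scale factor before applying it.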
    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known to be 0, so we never really need to set
// up an explicit register for it. However, DisableFramePointerElim will force
// us to use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry & chain functions we can use an immediate offset in most cases,
  // so the presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

// This is essentially a reduced version of hasFP for entry functions. Since the
// stack pointer is known 0 on entry to kernels, we never really need an FP
// register. We may need to initialize the stack pointer depending on the frame
// properties, which logically overlaps many of the cases where an ordinary
// function would require an FP.
// Also used for chain functions. While not technically entry functions, chain
// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
          MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
         "only expected to call this for entry points and chain functions");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}