xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (revision 3dd5524264095ed8612c28908e13f80668eff2f9)
//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
  "amdgpu-spill-vgpr-to-agpr",
  cl::desc("Enable spilling VGPRs to AGPRs"),
  cl::ReallyHidden,
  cl::init(true));
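// A ReallyHidden llc option; e.g. -amdgpu-spill-vgpr-to-agpr=0 disables the
// VGPR-to-AGPR spilling path guarded by this flag below.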

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  return MCRegister();
}

static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
                                           LivePhysRegs &LiveRegs,
                                           Register &TempSGPR,
                                           Optional<int> &FrameIndex,
                                           bool IsFP) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // We need to save and restore the current FP/BP.
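  // In order of preference (matching the numbered cases below):
  //   1) reuse a free lane in a VGPR that already holds SGPR spills,
  //   2) copy to an unused SGPR,
  //   3) spill a fresh VGPR and use one of its lanes,
  //   4) as a last resort, spill to scratch memory.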

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    FrameIndex = NewFI;

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to  "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n');
    return;
  }

  // 2: Next, try to save the FP/BP in an unused SGPR.
  TempSGPR = findScratchNonCalleeSaveRegister(
      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);

  if (!TempSGPR) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;

      LLVM_DEBUG(
          auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
          dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
    } else {
      // Remove dead <NewFI> index
      MF.getFrameInfo().RemoveStackObject(NewFI);
      // 4: If all else fails, spill the FP/BP to memory.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling "
                        << (IsFP ? "FP" : "BP") << '\n');
    }
  } else {
    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
                      << printReg(TempSGPR, TRI) << '\n');
  }
}

// We need to emit stack operations specially here because a different frame
// register is used at this point than the one getFrameRegister would return
// for the rest of the function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LivePhysRegs &LiveRegs, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveRegs.addReg(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, true,
                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                          &LiveRegs);
  LiveRegs.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LivePhysRegs &LiveRegs, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false,
                          FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                          &LiveRegs);
}

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
    .addReg(GitPtrLo);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills, since there is no user-facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
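    // E.g., on subtargets whose SMRD offsets are encoded in dwords, a byte
    // offset of 16 is encoded as 4; on byte-offset subtargets it stays 16.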
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
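    // The high dword holds address bits [63:32], so ANDing it with 0xffff
    // keeps address bits [47:32] and clears the flag bits in [63:48].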
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

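      // The s_setreg simm16 packs the hwreg ID in bits [5:0], the bit offset
      // in [10:6], and (width - 1) in [15:11]; 31 << WIDTH_M1_SHIFT_ selects
      // the full 32-bit register.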
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
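  // E.g., a wave byte offset of 0x2000 becomes 0x2000 >> 8 = 0x20.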
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
  LShr->getOperand(3).setIsDead(true); // Mark SCC as dead.
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers; the only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

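// Stack sizes and offsets in this file are per-lane; e.g., with MUBUF scratch
// on a wave64 target a per-lane size of 16 bytes becomes a wave-level offset
// of 16 * 64 = 1024 bytes, while flat scratch needs no scaling.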
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC, we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found conflicts with the
  // scratch wave offset, which may be in a fixed SGPR or a free SGPR chosen
  // by SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to
  // a free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  bool NeedsFlatScratchInit =
      MFI->hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // cpol
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11). If the
    // shader is actually wave32, we have to modify the const_index_stride
    // field of the descriptor's third sub-reg (bits 22:21) to 0b10
    // (stride = 32). The reason the driver does this is that there can be
    // cases where it presents two shaders with different wave sizes (e.g.
    // VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
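    // E.g., s_bitset0_b32 with immediate 21 turns the stride field 0b11
    // (stride 64) into 0b10 (stride 32) while leaving bit 22 set.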
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // cpol
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
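  //
  // Illustratively:
  //   s_add_u32  rsrc0, rsrc0, offset   // low 32 bits; sets SCC on carry
  //   s_addc_u32 rsrc1, rsrc1, 0        // propagate the carry into [47:32]
  // The flags in rsrc1[31:16] (descriptor bits [63:48]) are untouched since
  // no carry can come out of bit 47.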
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveRegs.empty()) {
    LiveRegs.init(TRI);
    if (IsProlog) {
      LiveRegs.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }
}

// Activate all lanes; returns the saved EXEC mask.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveRegs.addReg(ScratchExecCopy);

  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  auto SaveExec = BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
    .addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
// Otherwise we are spilling to memory.
static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  Register ScratchExecCopy;

  Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
  Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;

  // VGPRs used for SGPR->VGPR spills
  for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI)
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI,
                                             /*IsProlog*/ true);

    buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, Reg.VGPR,
                     *Reg.FI);
  }

  for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
    if (!ScratchExecCopy)
      ScratchExecCopy =
          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);

    buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                     std::get<0>(ReservedWWM), std::get<1>(ReservedWWM));
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  auto SaveSGPRToMemory = [&](Register Reg, const int FI) {
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(Reg);

    buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
                     FI);
  };

  auto SaveSGPRToVGPRLane = [&](Register Reg, const int FI) {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
        .addReg(Reg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  };

  if (FPSaveIndex) {
    if (spilledToMemory(MF, *FPSaveIndex))
      SaveSGPRToMemory(FramePtrReg, *FPSaveIndex);
    else
      SaveSGPRToVGPRLane(FramePtrReg, *FPSaveIndex);
  }

  // Emit the copy if we need an FP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForFPSaveRestoreCopy)
        .addReg(FramePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (BPSaveIndex) {
    if (spilledToMemory(MF, *BPSaveIndex))
      SaveSGPRToMemory(BasePtrReg, *BPSaveIndex);
    else
      SaveSGPRToVGPRLane(BasePtrReg, *BPSaveIndex);
  }

  // Emit the copy if we need a BP, and are using a free SGPR to save it.
  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
            FuncInfo->SGPRForBPSaveRestoreCopy)
        .addReg(BasePtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If a copy has been emitted for the FP and/or BP, make the SGPRs used in
  // the copy instructions live throughout the function.
  SmallVector<MCPhysReg, 2> TempSGPRs;
  if (FuncInfo->SGPRForFPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);

  if (FuncInfo->SGPRForBPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);

  if (!TempSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : TempSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveRegs.empty()) {
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
    }
  }

  if (TRI.hasStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
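    // E.g., for MaxAlign = 16 with MUBUF scratch on wave64 this emits:
    //   s_add_i32 s33, s32, 960    ; (16 - 1) * 64
    //   s_and_b32 s33, s33, -1024  ; -(16 * 64)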
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(FramePtrReg, RegState::Kill)
        .addImm(-Alignment * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                    !FuncInfo->FramePointerSaveIndex) ||
                   EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
                     FuncInfo->BasePointerSaveIndex)) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
                    !FuncInfo->BasePointerSaveIndex)) &&
         "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  const Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();

  Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
  Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
        .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  auto RestoreSGPRFromMemory = [&](Register Reg, const int FI) {
    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");
    buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
                       FI);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .addReg(TmpVGPR, RegState::Kill);
  };

  auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg)
        .addReg(Spill[0].VGPR)
        .addImm(Spill[0].Lane);
  };

  if (FPSaveIndex) {
    const int FramePtrFI = *FPSaveIndex;
    assert(!MFI.isDeadObjectIndex(FramePtrFI));
    if (spilledToMemory(MF, FramePtrFI))
      RestoreSGPRFromMemory(FramePtrReg, FramePtrFI);
    else
      RestoreSGPRFromVGPRLane(FramePtrReg, FramePtrFI);
  }

  if (BPSaveIndex) {
    const int BasePtrFI = *BPSaveIndex;
    assert(!MFI.isDeadObjectIndex(BasePtrFI));
    if (spilledToMemory(MF, BasePtrFI))
      RestoreSGPRFromMemory(BasePtrReg, BasePtrFI);
    else
      RestoreSGPRFromVGPRLane(BasePtrReg, BasePtrFI);
  }

  Register ScratchExecCopy;
  for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI)
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy =
          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);

    buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                       Reg.VGPR, *Reg.FI);
  }

  for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
    if (!ScratchExecCopy)
      ScratchExecCopy =
          buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);

    buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                       std::get<0>(ReservedWWM), std::get<1>(ReservedWWM));
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        (I != FuncInfo->FramePointerSaveIndex &&
         I != FuncInfo->BasePointerSaveIndex)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (!FuncInfo->isEntryFunction()) {
    // Spill VGPRs used for Whole Wave Mode
    FuncInfo->allocateWWMReservedSpillSlots(MFI, *TRI);
  }

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() &&
                               FuncInfo->hasSpilledVGPRs() &&
                               EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
            TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            // FIXME: change to enterBasicBlockEnd()
            RS->enterBasicBlock(MBB);
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack
    // slot. If no other object shares the slot, the VGPR-to-AGPR spill slot
    // is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update them with the
        // correct register values, but it is not clear that the register value
        // alone is enough.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we initially reserved the highest available VGPR for AGPR
    // copies. Now that RA is done, check whether there is an unused VGPR
    // lower than the one reserved before RA. If one exists, use it for the
    // AGPR copy instead.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Call to setVGPRForAGPRCopy() should happen first before calling
      // freezeReservedRegs() so that getReservedRegs() can reserve this newly
      // identified VGPR (for AGPR copy).
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.freezeReservedRegs(MF);
    }
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there are no direct AGPR loads and stores, so spilling an AGPR
  // also requires a temporary VGPR.
1260   if (!ST.hasGFX90AInsts())
1261     SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1262 
1263   // hasFP only knows about stack objects that already exist. We're now
1264   // determining the stack slots that will be created, so we have to predict
1265   // them. Stack objects force FP usage with calls.
1266   //
1267   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1268   // don't want to report it here.
1269   //
1270   // FIXME: Is this really hasReservedCallFrame?
1271   const bool WillHaveFP =
1272       FrameInfo.hasCalls() &&
1273       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1274 
1275   // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
1276   // so don't allow the default insertion to handle them.
1277   for (auto SSpill : MFI->getSGPRSpillVGPRs())
1278     SavedVGPRs.reset(SSpill.VGPR);
1279 
1280   LivePhysRegs LiveRegs;
1281   LiveRegs.init(*TRI);
1282 
1283   if (WillHaveFP || hasFP(MF)) {
1284     assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex &&
1285            "Re-reserving spill slot for FP");
1286     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
1287                                    MFI->FramePointerSaveIndex, true);
1288   }
1289 
1290   if (TRI->hasBasePointer(MF)) {
1291     if (MFI->SGPRForFPSaveRestoreCopy)
1292       LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
1293 
1294     assert(!MFI->SGPRForBPSaveRestoreCopy &&
1295            !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP");
1296     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
1297                                    MFI->BasePointerSaveIndex, false);
1298   }
1299 }
1300 
1301 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1302                                                BitVector &SavedRegs,
1303                                                RegScavenger *RS) const {
1304   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1305   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1306   if (MFI->isEntryFunction())
1307     return;
1308 
1309   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1310   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1311 
1312   // The SP is specially managed and we don't want extra spills of it.
1313   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1314 
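       // Snapshot the full save set (still including vector registers) before
       // masking it down to SGPRs; the copy feeds the WillHaveFP check below.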
1315   const BitVector AllSavedRegs = SavedRegs;
1316   SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1317 
1318   // We have to anticipate introducing CSR VGPR spills, or a spill of the
1319   // caller-save VGPR reserved for SGPR spills, as we now always create a
1320   // stack entry for it even if there are no other stack objects yet, since
1321   // we require an FP if there is a call and a stack. We will allocate a
1322   // VGPR for SGPR spills if there are any SGPR spills, CSR or otherwise.
1323   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1324   const bool WillHaveFP =
1325       FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1326 
1327   // FP will be specially managed like SP.
1328   if (WillHaveFP || hasFP(MF))
1329     SavedRegs.reset(MFI->getFrameOffsetReg());
1330 
1331   // The return address use by the return instruction is hidden through the
1332   // SI_RETURN pseudo. Given that, and since IPRA computes actual register
1333   // usage and does not use the CSR list, the clobbering of the return
1334   // address by function calls (D117243) or otherwise (D120922) is not seen
1335   // by IPRA's register usage collection. Explicitly saving it here ensures
1336   // the return address is saved/restored in those scenarios.
1337   const MachineRegisterInfo &MRI = MF.getRegInfo();
1338   Register RetAddrReg = TRI->getReturnAddressReg(MF);
1339   if (!MFI->isEntryFunction() &&
1340       (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1341     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1342     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1343   }
1344 }
1345 
1346 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1347     MachineFunction &MF, const TargetRegisterInfo *TRI,
1348     std::vector<CalleeSavedInfo> &CSI) const {
1349   if (CSI.empty())
1350     return true; // Early exit if no callee saved registers are modified!
1351 
1352   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1353   if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
1354       !FuncInfo->SGPRForBPSaveRestoreCopy)
1355     return false;
1356 
1357   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1358   const SIRegisterInfo *RI = ST.getRegisterInfo();
1359   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1360   Register BasePtrReg = RI->getBaseRegister();
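       // Count how many of the FP/BP saves go through an SGPR copy so the loop
       // below can stop once each such entry has had its destination rewritten.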
1361   unsigned NumModifiedRegs = 0;
1362 
1363   if (FuncInfo->SGPRForFPSaveRestoreCopy)
1364     NumModifiedRegs++;
1365   if (FuncInfo->SGPRForBPSaveRestoreCopy)
1366     NumModifiedRegs++;
1367 
1368   for (auto &CS : CSI) {
1369     if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
1370       CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
1371       if (--NumModifiedRegs == 0)
1372         break;
1373     } else if (CS.getReg() == BasePtrReg &&
1374                FuncInfo->SGPRForBPSaveRestoreCopy) {
1375       CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
1376       if (--NumModifiedRegs == 0)
1377         break;
1378     }
1379   }
1380 
1381   return false;
1382 }
1383 
1384 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1385   const MachineFunction &MF) const {
1386 
1387   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1388   const MachineFrameInfo &MFI = MF.getFrameInfo();
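       // The worst case is an access to the last byte of the estimated frame.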
1389   uint64_t EstStackSize = MFI.estimateStackSize(MF);
1390   uint64_t MaxOffset = EstStackSize - 1;
1391 
1392   // We need the emergency stack slots to be allocated in range of the
1393   // MUBUF/flat scratch immediate offset from the base register, so assign these
1394   // first at the incoming SP position.
1395   //
1396   // TODO: We could try sorting the objects to find a hole in the first bytes
1397   // rather than allocating as close as possible. This could save a lot of space
1398   // on frames with alignment requirements.
1399   if (ST.enableFlatScratch()) {
1400     const SIInstrInfo *TII = ST.getInstrInfo();
1401     if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1402                                SIInstrFlags::FlatScratch))
1403       return false;
1404   } else {
1405     if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
1406       return false;
1407   }
1408 
1409   return true;
1410 }
1411 
1412 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1413   MachineFunction &MF,
1414   MachineBasicBlock &MBB,
1415   MachineBasicBlock::iterator I) const {
1416   int64_t Amount = I->getOperand(0).getImm();
1417   if (Amount == 0)
1418     return MBB.erase(I);
1419 
1420   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1421   const SIInstrInfo *TII = ST.getInstrInfo();
1422   const DebugLoc &DL = I->getDebugLoc();
1423   unsigned Opc = I->getOpcode();
1424   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1425   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1426 
1427   if (!hasReservedCallFrame(MF)) {
1428     Amount = alignTo(Amount, getStackAlign());
1429     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1430     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1431     Register SPReg = MFI->getStackPtrOffsetReg();
1432 
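         // The SP is kept in scaled units: getScratchScaleFactor() is the wave
         // size for swizzled MUBUF scratch and 1 for flat scratch, so convert
         // the byte amount before adjusting the SP.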
1433     Amount *= getScratchScaleFactor(ST);
1434     if (IsDestroy)
1435       Amount = -Amount;
1436     auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1437         .addReg(SPReg)
1438         .addImm(Amount);
1439     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1440   } else if (CalleePopAmount != 0) {
1441     llvm_unreachable("is this used?");
1442   }
1443 
1444   return MBB.erase(I);
1445 }
1446 
1447 /// Returns true if the frame will require a reference to the stack pointer.
1448 ///
1449 /// This is the set of conditions common to setting up the stack pointer in a
1450 /// kernel, and for using a frame pointer in a callable function.
1451 ///
1452 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1453 /// references SP.
1454 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1455   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1456 }
1457 
1458 // The FP for kernels is always known to be 0, so we never really need to set
1459 // up an explicit register for it. However, DisableFramePointerElim will force
1460 // us to use a register for it.
1461 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1462   const MachineFrameInfo &MFI = MF.getFrameInfo();
1463 
1464   // For entry functions we can use an immediate offset in most cases, so the
1465   // presence of calls doesn't imply we need a distinct frame pointer.
1466   if (MFI.hasCalls() &&
1467       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1468     // All offsets are unsigned, so they need to be addressed in the same
1469     // direction as stack growth.
1470 
1471     // FIXME: This function is pretty broken, since it can be called before the
1472     // frame layout is determined or CSR spills are inserted.
1473     return MFI.getStackSize() != 0;
1474   }
1475 
1476   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1477          MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1478              MF) ||
1479          MF.getTarget().Options.DisableFramePointerElim(MF);
1480 }
1481 
1482 // This is essentially a reduced version of hasFP for entry functions. Since
1483 // the stack pointer is known to be 0 on entry to kernels, we never really
1484 // need an FP register. We may need to initialize the stack pointer depending
1485 // on the frame properties, which logically overlaps many of the cases where
1486 // an ordinary function would require an FP.
1487 bool SIFrameLowering::requiresStackPointerReference(
1488     const MachineFunction &MF) const {
1489   // Callable functions always require a stack pointer reference.
1490   assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1491          "only expected to call this for entry points");
1492 
1493   const MachineFrameInfo &MFI = MF.getFrameInfo();
1494 
1495   // Entry points ordinarily don't need to initialize SP. We have to set it up
1496   // for callees if there are any. Also note tail calls are impossible/don't
1497   // make any sense for kernels.
1498   if (MFI.hasCalls())
1499     return true;
1500 
1501   // We still need to initialize the SP if we're doing anything weird that
1502   // references the SP, like variable sized stack objects.
1503   return frameTriviallyRequiresSP(MFI);
1504 }
1505