//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
  "amdgpu-spill-vgpr-to-agpr",
  cl::desc("Enable spilling VGPRs to AGPRs"),
  cl::ReallyHidden,
  cl::init(true));
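// A note on the option above: AGPRs are the accumulator registers introduced
// with the MAI (matrix) instructions on MAI-capable subtargets; spilling
// VGPRs into otherwise idle AGPRs avoids a round trip through scratch memory.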

// Find a register matching \p RC from \p LiveRegs which is unused and available
// throughout the function. On failure, returns AMDGPU::NoRegister.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LivePhysRegs &LiveRegs,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveRegs, RC);

  for (MCRegister Reg : RC) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }

  return MCRegister();
}

/// Decide where to save \p SGPR across the prologue/epilogue.
/// \p IncludeScratchCopy : Also consider copying to a free scratch SGPR.
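/// The chosen method is recorded in the function info: copy to an unused
/// scratch SGPR when one exists, otherwise spill to a VGPR lane, otherwise
/// spill through a temporary VGPR to scratch memory.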
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs
  // should have all the callee saved registers marked as used. In certain
  // cases we skip the copy to a scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
      // 2: No free scratch SGPR was found, so spill the value to a lane of a
      // VGPR reserved for prolog/epilog spills instead.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(
          auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
          dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveRegs.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}

// We need to emit stack operations specially here because the prologue and
// epilogue may use a different frame register than the one getFrameRegister
// returns for the rest of the function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LivePhysRegs &LiveRegs, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveRegs.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
  if (IsKill)
    LiveRegs.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LivePhysRegs &LiveRegs, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
}

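// Materialize the 64-bit global information table (GIT) pointer in
// \p TargetReg. The high half comes from the amdgpu-git-ptr-high function
// attribute when one is set; otherwise it is taken from the current PC via
// S_GETPC_B64. The low half is copied from the SGPR the driver preloads.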
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
    .addReg(GitPtrLo);
}

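// Lazily initialize \p LiveRegs at the given insertion point: block live-ins
// for a prologue, or live-outs stepped back across \p MBBI for an epilogue.
// A non-empty set is assumed to be initialized already and is left untouched.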
static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveRegs.empty()) {
    LiveRegs.init(TRI);
    if (IsProlog) {
      LiveRegs.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }
}

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LivePhysRegs &LiveRegs;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

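  // Spill each 32-bit piece of SuperReg to stack slot FI: copy the piece into
  // a scratch VGPR with V_MOV_B32, then store that VGPR to the frame at
  // increasing dword offsets.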
  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

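  // Inverse of saveToMemory: reload each dword of SuperReg from FI into a
  // scratch VGPR, then move it back to the SGPR with V_READFIRSTLANE_B32.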
  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
                         FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LivePhysRegs &LiveRegs, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
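// When flat scratch is a pointer, the sequence below adds the per-wave
// scratch offset to the preloaded flat scratch base and writes the result to
// FLAT_SCR; on GFX10 and later this goes through S_SETREG_B32, since FLAT_SCR
// is not directly addressable as an SGPR pair there.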
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last available SGPRs for this. Shift the rsrc register down
  // to the end of those which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers, but we
  // cannot do so for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

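// Frame offsets are computed in per-lane bytes. Without flat scratch, the SP
// and FP registers hold byte offsets into the wave's swizzled scratch
// allocation, so per-lane sizes are scaled up by the wavefront size.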
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found overlaps the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  bool NeedsFlatScratchInit =
      MFI->hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // cpol
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11).
    // If the shader is actually wave32, we have to modify the const_index_stride
    // field of the descriptor's third sub-reg (bits 22:21) to 0b10 (stride=32).
    // The reason the driver does this is that there can be cases where it
    // presents two shaders with different wave sizes (e.g. a VS/FS pair).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // cpol
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. Returns the register holding the saved EXEC.
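// A rough sketch of what this emits on a wave64 target, with s[N:N+1] standing
// for whatever scratch SGPR pair is found:
//   s_xor_saveexec_b64 s[N:N+1], -1  ; EnableInactiveLanes: run only the lanes
//                                    ; that were inactive
//   s_or_saveexec_b64  s[N:N+1], -1  ; otherwise: enable all lanes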
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveRegs.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

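// Emit the prologue stores for callee-saved and WWM registers: WWM VGPRs
// first (only the inactive lanes for WWM scratch registers, all lanes for WWM
// callee saves), then the prolog/epilog SGPR saves recorded in the function
// info.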
void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // If FP is saved to a scratch SGPR, skip it; that save has already been
    // emitted. Otherwise, FP has been moved to a temporary register, so spill
    // that register instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveRegs.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveRegs.addReg(Reg);
    }
  }
}

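// Mirror of emitCSRSpillStores: restore the prolog/epilog SGPR saves first,
// then reload the WWM VGPRs, flipping EXEC around the WWM reloads the same
// way the stores did.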
void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip it if FP is restored from the scratch SGPR. Otherwise, restore the
    // FP value into a temporary register; the frame pointer must be
    // overwritten only at the end, after all other spills have been restored
    // from the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

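// Prologue for non-entry functions, roughly: emit the WWM and prolog/epilog
// SGPR saves (SP-relative when no frame pointer is needed, FP-relative
// otherwise), establish FP (realigning the stack if required) and BP, then
// bump SP by the rounded frame size.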
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveRegs, FramePtrReg);
      SB.save();
      LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveRegs.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(FramePtrReg, RegState::Kill)
        .addImm(-Alignment * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveRegs.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LivePhysRegs LiveRegs;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as the base register. If
    // SGPRForFPSaveRestoreCopy is not set, restore the previous value of FP
    // into a new scratch register and copy it to FP later, once the other
    // registers have been restored from the current stack frame.
1243     initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1244     if (SGPRForFPSaveRestoreCopy) {
1245       LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
1246     } else {
1247       FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1248           MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
1249       if (!FramePtrRegScratchCopy)
1250         report_fatal_error("failed to find free scratch register");
1251 
1252       LiveRegs.addReg(FramePtrRegScratchCopy);
1253     }
1254 
1255     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
1256                          FramePtrRegScratchCopy);
1257   }
1258 
1259   if (RoundedSize != 0 && hasFP(MF)) {
1260     auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1261         .addReg(StackPtrReg)
1262         .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1263         .setMIFlag(MachineInstr::FrameDestroy);
1264     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1265   }
1266 
1267   if (FPSaved) {
1268     // Insert the copy to restore FP.
1269     Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1270                                                : FramePtrRegScratchCopy;
1271     MachineInstrBuilder MIB =
1272         BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1273             .addReg(SrcReg);
1274     if (SGPRForFPSaveRestoreCopy)
1275       MIB.setMIFlag(MachineInstr::FrameDestroy);
1276   } else {
1277     // Insert the CSR spill restores with SP as the base register.
1278     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
1279                          FramePtrRegScratchCopy);
1280   }
1281 }
1282 
1283 #ifndef NDEBUG
1284 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1285   const MachineFrameInfo &MFI = MF.getFrameInfo();
1286   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1287   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1288        I != E; ++I) {
1289     if (!MFI.isDeadObjectIndex(I) &&
1290         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1291         !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1292       return false;
1293     }
1294   }
1295 
1296   return true;
1297 }
1298 #endif
1299 
1300 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1301                                                     int FI,
1302                                                     Register &FrameReg) const {
1303   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1304 
1305   FrameReg = RI->getFrameRegister(MF);
1306   return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1307 }
1308 
1309 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1310   MachineFunction &MF,
1311   RegScavenger *RS) const {
1312   MachineFrameInfo &MFI = MF.getFrameInfo();
1313 
1314   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1315   const SIInstrInfo *TII = ST.getInstrInfo();
1316   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1317   MachineRegisterInfo &MRI = MF.getRegInfo();
1318   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1319 
1320   // Allocate spill slots for WWM reserved VGPRs.
1321   if (!FuncInfo->isEntryFunction()) {
1322     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
1323       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1324       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
1325                                  TRI->getSpillAlign(*RC));
1326     }
1327   }
1328 
1329   const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1330                                && EnableSpillVGPRToAGPR;
1331 
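       // Rewrite VGPR spill instructions to target AGPRs where possible, then
       // mark the spill slots left without any other users as dead.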
1332   if (SpillVGPRToAGPR) {
1333     // To track the spill frame indices handled in this pass.
1334     BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1335     BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
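         // NonVGPRSpillFIs records frame indices also used by non-VGPR-spill
         // stack accesses; a spill slot shared with one of those must stay live.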
1336 
1337     bool SeenDbgInstr = false;
1338 
1339     for (MachineBasicBlock &MBB : MF) {
1340       for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
1341         int FrameIndex;
1342         if (MI.isDebugInstr())
1343           SeenDbgInstr = true;
1344 
1345         if (TII->isVGPRSpill(MI)) {
1346           // Try to eliminate the stack space used by VGPR spills before frame
1347           // finalization.
1348           unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1349                                                      AMDGPU::OpName::vaddr);
1350           int FI = MI.getOperand(FIOp).getIndex();
1351           Register VReg =
1352             TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1353           if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1354                                                 TRI->isAGPR(MRI, VReg))) {
1355             assert(RS != nullptr);
1356             // FIXME: change to enterBasicBlockEnd()
1357             RS->enterBasicBlock(MBB);
1358             TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
1359             SpillFIs.set(FI);
1360             continue;
1361           }
1362         } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1363                    TII->isLoadFromStackSlot(MI, FrameIndex))
1364           if (!MFI.isFixedObjectIndex(FrameIndex))
1365             NonVGPRSpillFIs.set(FrameIndex);
1366       }
1367     }
1368 
1369     // Stack slot coloring may have assigned other objects to the same slot.
1370     // If no non-VGPR spill shares the slot, the VGPR-to-AGPR spill slot is dead.
1371     for (unsigned FI : SpillFIs.set_bits())
1372       if (!NonVGPRSpillFIs.test(FI))
1373         FuncInfo->setVGPRToAGPRSpillDead(FI);
1374 
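         // The AGPRs and VGPRs now holding spilled values live across blocks
         // behind the allocator's back, so add them as block live-ins.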
1375     for (MachineBasicBlock &MBB : MF) {
1376       for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1377         MBB.addLiveIn(Reg);
1378 
1379       for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1380         MBB.addLiveIn(Reg);
1381 
1382       MBB.sortUniqueLiveIns();
1383 
1384       if (!SpillFIs.empty() && SeenDbgInstr) {
1385         // FIXME: The dead frame indices are replaced with a null register from
1386         // the debug value instructions. We should instead update them with the
1387         // correct register value. But the register value alone may not be
             // adequate to lower the DIExpression; that should be worked out later.
1388         for (MachineInstr &MI : MBB) {
1389           if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
1390               !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
1391               SpillFIs[MI.getOperand(0).getIndex()]) {
1392             MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
1393           }
1394         }
1395       }
1396     }
1397   }
1398 
1399   // At this point we've already allocated all spilled SGPRs to VGPRs if we
1400   // can. Any remaining SGPR spills will go to memory, so move them back to the
1401   // default stack.
1402   bool HaveSGPRToVMemSpill =
1403       FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1404   assert(allSGPRSpillsAreDead(MF) &&
1405          "SGPR spill should have been removed in SILowerSGPRSpills");
1406 
1407   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1408   // but currently hasNonSpillStackObjects is set only from source
1409   // allocas. Stack temps produced from legalization are not counted currently.
1410   if (!allStackObjectsAreDead(MFI)) {
1411     assert(RS && "RegScavenger required if spilling");
1412 
1413     // Add an emergency spill slot
1414     RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1415 
1416     // If we are spilling SGPRs to memory with a large frame, we may need a
1417     // second VGPR emergency frame index.
1418     if (HaveSGPRToVMemSpill &&
1419         allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1420       RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
1421     }
1422   }
1423 }
1424 
1425 void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1426     MachineFunction &MF, RegScavenger *RS) const {
1427   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1428   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1429   MachineRegisterInfo &MRI = MF.getRegInfo();
1430   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1431 
1432   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1433     // On gfx908, we initially reserved the highest available VGPR for the
1434     // AGPR copy. Now that RA is done, check whether there exists an unused
1435     // VGPR lower than the one reserved earlier. If one exists, use it for the
1436     // AGPR copy instead of the register reserved before RA.
1437     Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1438     Register UnusedLowVGPR =
1439         TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1440     if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1441                           TRI->getHWRegIndex(VGPRForAGPRCopy))) {
1442       // Reserve this newly identified VGPR for the AGPR copy.
1443       // Reserved registers should already be frozen at this point,
1444       // so we can avoid calling MRI.freezeReservedRegs and just use
1445       // MRI.reserveReg.
1446       FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1447       MRI.reserveReg(UnusedLowVGPR, TRI);
1448     }
1449   }
1450   // We initially reserved the highest available SGPR pair for long branches;
1451   // now, after RA, shift down to a lower unused pair if one exists.
1452   Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1453   Register UnusedLowSGPR =
1454       TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
1455   // If LongBranchReservedReg is null, then we didn't find a long branch
1456   // and never reserved a register to begin with, so there is nothing to
1457   // shift down. If UnusedLowSGPR is null, there is no available lower
1458   // register to use, so just keep the original one we set.
1459   if (LongBranchReservedReg && UnusedLowSGPR) {
1460     FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1461     MRI.reserveReg(UnusedLowSGPR, TRI);
1462   }
1463 }
1464 
1465 // The special SGPR spills, like the ones needed for the FP, BP, or any
1466 // reserved registers, are delayed until frame lowering.
1467 void SIFrameLowering::determinePrologEpilogSGPRSaves(
1468     MachineFunction &MF, BitVector &SavedVGPRs,
1469     bool NeedExecCopyReservedReg) const {
1470   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1471   MachineRegisterInfo &MRI = MF.getRegInfo();
1472   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1473   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1474   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1475   LivePhysRegs LiveRegs;
1476   LiveRegs.init(*TRI);
1477   // Initially mark callee saved registers as used so we will not choose them
1478   // while looking for scratch SGPRs.
1479   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1480   for (unsigned I = 0; CSRegs[I]; ++I)
1481     LiveRegs.addReg(CSRegs[I]);
1482 
1483   const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1484 
1485   if (NeedExecCopyReservedReg) {
1486     Register ReservedReg = MFI->getSGPRForEXECCopy();
1487     assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
1488     Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC);
1489     if (UnusedScratchReg) {
1490       // If we found an unused scratch SGPR, reserve that register itself for
1491       // the EXEC copy; there is no need for any spill in that case.
1492       MFI->setSGPRForEXECCopy(UnusedScratchReg);
1493       LiveRegs.addReg(UnusedScratchReg);
1494     } else {
1495       // Needs spill.
1496       assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
1497              "Re-reserving spill slot for EXEC copy register");
1498       getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC,
1499                                      /*IncludeScratchCopy=*/false);
1500     }
1501   }
1502 
1503   // hasFP only knows about stack objects that already exist. We're now
1504   // determining the stack slots that will be created, so we have to predict
1505   // them. Stack objects force FP usage with calls.
1506   //
1507   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1508   // don't want to report it here.
1509   //
1510   // FIXME: Is this really hasReservedCallFrame?
1511   const bool WillHaveFP =
1512       FrameInfo.hasCalls() &&
1513       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1514 
1515   if (WillHaveFP || hasFP(MF)) {
1516     Register FramePtrReg = MFI->getFrameOffsetReg();
1517     assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1518            "Re-reserving spill slot for FP");
1519     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
1520   }
1521 
1522   if (TRI->hasBasePointer(MF)) {
1523     Register BasePtrReg = TRI->getBaseRegister();
1524     assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1525            "Re-reserving spill slot for BP");
1526     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
1527   }
1528 }
1529 
1530 // Only report VGPRs to generic code.
1531 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1532                                            BitVector &SavedVGPRs,
1533                                            RegScavenger *RS) const {
1534   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1535   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1536   if (MFI->isEntryFunction())
1537     return;
1538 
1539   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1540   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1541   const SIInstrInfo *TII = ST.getInstrInfo();
1542   bool NeedExecCopyReservedReg = false;
1543 
1544   MachineInstr *ReturnMI = nullptr;
1545   for (MachineBasicBlock &MBB : MF) {
1546     for (MachineInstr &MI : MBB) {
1547       // WRITELANE instructions used for SGPR spills can overwrite the inactive
1548       // lanes of VGPRs, so the callee must spill and restore them even if they
1549       // are marked caller-saved.
1550 
1551       // TODO: Handle this elsewhere at an early point. Walking through all MBBs
1552       // here would be a bad heuristic. A better way should be by calling
1553       // allocateWWMSpill during the regalloc pipeline whenever a physical
1554       // register is allocated for the intended virtual registers. That will
1555       // also help exclude the general use of WRITELANE/READLANE intrinsics
1556       // that won't really need any such special handling.
1557       if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
1558         MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
1559       else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
1560         MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
1561       else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
1562         NeedExecCopyReservedReg = true;
1563       else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1564                MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
1565         // We expect all returns to be the same size.
1566         assert(!ReturnMI ||
1567                (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1568                 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1569         ReturnMI = &MI;
1570       }
1571     }
1572   }
1573 
1574   // Remove any VGPRs used in the return value because these do not need to be saved.
1575   // This prevents CSR restore from clobbering return VGPRs.
1576   if (ReturnMI) {
1577     for (auto &Op : ReturnMI->operands()) {
1578       if (Op.isReg())
1579         SavedVGPRs.reset(Op.getReg());
1580     }
1581   }
1582 
1583   // Ignore the SGPRs the default implementation found.
1584   SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1585 
1586   // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1587   // On gfx908 there are no AGPR loads and stores, so spilling them also
1588   // requires a temporary VGPR.
1589   if (!ST.hasGFX90AInsts())
1590     SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1591 
1592   determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1593 
1594   // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1595   // allow the default insertion to handle them.
1596   for (auto &Reg : MFI->getWWMSpills())
1597     SavedVGPRs.reset(Reg.first);
1598 
1599   // Mark all lane VGPRs as BB LiveIns.
1600   for (MachineBasicBlock &MBB : MF) {
1601     for (auto &Reg : MFI->getWWMSpills())
1602       MBB.addLiveIn(Reg.first);
1603 
1604     MBB.sortUniqueLiveIns();
1605   }
1606 }
1607 
1608 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1609                                                BitVector &SavedRegs,
1610                                                RegScavenger *RS) const {
1611   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1612   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1613   if (MFI->isEntryFunction())
1614     return;
1615 
1616   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1617   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1618 
1619   // The SP is specifically managed and we don't want extra spills of it.
1620   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1621 
1622   const BitVector AllSavedRegs = SavedRegs;
1623   SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1624 
1625   // We have to anticipate introducing CSR VGPR spills, or the spill of a
1626   // caller-saved VGPR reserved for SGPR spills, as we now always create a
1627   // stack entry for it even if there are no other stack objects, since we
1628   // require an FP if there is a call and a stack. We will allocate a VGPR
1629   // for SGPR spills if there are any SGPR spills, whether CSR or otherwise.
1630   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1631   const bool WillHaveFP =
1632       FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1633 
1634   // FP will be specially managed like SP.
1635   if (WillHaveFP || hasFP(MF))
1636     SavedRegs.reset(MFI->getFrameOffsetReg());
1637 
1638   // The return address use with the return instruction is hidden through the
1639   // SI_RETURN pseudo. Given that, and since IPRA computes actual register
1640   // usage rather than using the CSR list, the clobbering of the return
1641   // address by function calls (D117243) or otherwise (D120922) is not seen
1642   // by IPRA's register usage collection. Setting these bits ensures that the
1643   // return address is saved and restored in those scenarios.
1644   const MachineRegisterInfo &MRI = MF.getRegInfo();
1645   Register RetAddrReg = TRI->getReturnAddressReg(MF);
1646   if (!MFI->isEntryFunction() &&
1647       (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1648     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1649     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1650   }
1651 }
1652 
1653 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1654     MachineFunction &MF, const TargetRegisterInfo *TRI,
1655     std::vector<CalleeSavedInfo> &CSI) const {
1656   if (CSI.empty())
1657     return true; // Early exit if no callee saved registers are modified!
1658 
1659   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1660   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1661   const SIRegisterInfo *RI = ST.getRegisterInfo();
1662   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1663   Register BasePtrReg = RI->getBaseRegister();
1664   Register SGPRForFPSaveRestoreCopy =
1665       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1666   Register SGPRForBPSaveRestoreCopy =
1667       FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1668   if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1669     return false;
1670 
1671   unsigned NumModifiedRegs = 0;
1672 
1673   if (SGPRForFPSaveRestoreCopy)
1674     NumModifiedRegs++;
1675   if (SGPRForBPSaveRestoreCopy)
1676     NumModifiedRegs++;
1677 
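       // Redirect the FP/BP entries in the CSI list to their scratch SGPR
       // copies, and stop scanning once all of them have been handled.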
1678   for (auto &CS : CSI) {
1679     if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1680       CS.setDstReg(SGPRForFPSaveRestoreCopy);
1681       if (--NumModifiedRegs == 0)
1682         break;
1683     } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1684       CS.setDstReg(SGPRForBPSaveRestoreCopy);
1685       if (--NumModifiedRegs == 0)
1686         break;
1687     }
1688   }
1689 
1690   return false;
1691 }
1692 
1693 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1694   const MachineFunction &MF) const {
1695 
1696   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1697   const MachineFrameInfo &MFI = MF.getFrameInfo();
1698   uint64_t EstStackSize = MFI.estimateStackSize(MF);
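       // The largest offset a frame access could need is the last byte of the
       // estimated frame.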
1699   uint64_t MaxOffset = EstStackSize - 1;
1700 
1701   // We need the emergency stack slots to be allocated in range of the
1702   // MUBUF/flat scratch immediate offset from the base register, so assign these
1703   // first at the incoming SP position.
1704   //
1705   // TODO: We could try sorting the objects to find a hole in the first bytes
1706   // rather than allocating as close as possible. This could save a lot of space
1707   // on frames with alignment requirements.
1708   if (ST.enableFlatScratch()) {
1709     const SIInstrInfo *TII = ST.getInstrInfo();
1710     if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1711                                SIInstrFlags::FlatScratch))
1712       return false;
1713   } else {
1714     if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
1715       return false;
1716   }
1717 
1718   return true;
1719 }
1720 
1721 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1722   MachineFunction &MF,
1723   MachineBasicBlock &MBB,
1724   MachineBasicBlock::iterator I) const {
1725   int64_t Amount = I->getOperand(0).getImm();
1726   if (Amount == 0)
1727     return MBB.erase(I);
1728 
1729   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1730   const SIInstrInfo *TII = ST.getInstrInfo();
1731   const DebugLoc &DL = I->getDebugLoc();
1732   unsigned Opc = I->getOpcode();
1733   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1734   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1735 
1736   if (!hasReservedCallFrame(MF)) {
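         // With no reserved call frame, adjust SP directly; the byte amount is
         // converted below into the scaled units in which SP is maintained.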
1737     Amount = alignTo(Amount, getStackAlign());
1738     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1739     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1740     Register SPReg = MFI->getStackPtrOffsetReg();
1741 
1742     Amount *= getScratchScaleFactor(ST);
1743     if (IsDestroy)
1744       Amount = -Amount;
1745     auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1746         .addReg(SPReg)
1747         .addImm(Amount);
1748     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1749   } else if (CalleePopAmount != 0) {
1750     llvm_unreachable("is this used?");
1751   }
1752 
1753   return MBB.erase(I);
1754 }
1755 
1756 /// Returns true if the frame will require a reference to the stack pointer.
1757 ///
1758 /// This is the set of conditions common to setting up the stack pointer in a
1759 /// kernel, and for using a frame pointer in a callable function.
1760 ///
1761 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1762 /// references SP.
1763 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1764   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1765 }
1766 
1767 // The FP for kernels is always known 0, so we never really need to set up an
1768 // explicit register for it. However, DisableFramePointerElim will force us to
1769 // use a register for it.
1770 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1771   const MachineFrameInfo &MFI = MF.getFrameInfo();
1772 
1773   // For entry functions we can use an immediate offset in most cases, so the
1774   // presence of calls doesn't imply we need a distinct frame pointer.
1775   if (MFI.hasCalls() &&
1776       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1777     // All offsets are unsigned, so need to be addressed in the same direction
1778     // as stack growth.
1779 
1780     // FIXME: This function is pretty broken, since it can be called before the
1781     // frame layout is determined or CSR spills are inserted.
1782     return MFI.getStackSize() != 0;
1783   }
1784 
1785   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1786          MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1787              MF) ||
1788          MF.getTarget().Options.DisableFramePointerElim(MF);
1789 }
1790 
1791 // This is essentially a reduced version of hasFP for entry functions. Since the
1792 // stack pointer is known 0 on entry to kernels, we never really need an FP
1793 // register. We may need to initialize the stack pointer depending on the frame
1794 // properties, which logically overlaps many of the cases where an ordinary
1795 // function would require an FP.
1796 bool SIFrameLowering::requiresStackPointerReference(
1797     const MachineFunction &MF) const {
1798   // Callable functions always require a stack pointer reference.
1799   assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1800          "only expected to call this for entry points");
1801 
1802   const MachineFrameInfo &MFI = MF.getFrameInfo();
1803 
1804   // Entry points ordinarily don't need to initialize SP. We have to set it up
1805   // for callees if there are any. Also note tail calls are impossible/don't
1806   // make any sense for kernels.
1807   if (MFI.hasCalls())
1808     return true;
1809 
1810   // We still need to initialize the SP if we're doing anything weird that
1811   // references the SP, like variable sized stack objects.
1812   return frameTriviallyRequiresSP(MFI);
1813 }
1814