//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
  "amdgpu-spill-vgpr-to-agpr",
  cl::desc("Enable spilling VGPRs to AGPRs"),
  cl::ReallyHidden,
  cl::init(true));
// Find a register in \p RC which is unused throughout the function and
// available in \p LiveRegs. On failure, returns AMDGPU::NoRegister.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LivePhysRegs &LiveRegs,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveRegs, RC);

  for (MCRegister Reg : RC) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }

  return MCRegister();
}

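// Arrange for \p SGPR to be saved and restored around the prologue/epilogue:
// prefer a copy into an unused scratch SGPR, then a spill into a free VGPR
// lane, and fall back to a spill slot in scratch memory.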
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs
  // should have all the callee saved registers marked as used.
  Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
      // 2: There was no free scratch SGPR to copy into, so spill the SGPR
      // into a free VGPR lane (possibly forcing a new VGPR into use).
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(
          auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
          dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveRegs.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}

// We need to emit stack operations specially here because the frame register
// used differs from the one used in the rest of the function (which is what
// getFrameRegister would return).
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LivePhysRegs &LiveRegs, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveRegs.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
  if (IsKill)
    LiveRegs.removeReg(SpillReg);
}

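// Counterpart of buildPrologSpill: reload \p SpillReg from the stack slot at
// \p FI in the epilogue, again using the explicitly supplied frame register.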
static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LivePhysRegs &LiveRegs, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
}

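// Materialize the 64-bit pointer to the GIT (global information table) in
// \p TargetReg. The low half comes from the preloaded GIT-pointer SGPR; the
// high half comes from the amdgpu-git-ptr-high attribute when set, and is
// otherwise derived from the PC via S_GETPC_B64.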
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
    .addReg(GitPtrLo);
}

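// Lazily initialize \p LiveRegs: seed it from the block live-ins for a
// prologue, or from the live-outs stepped backward over \p MBBI for an
// epilogue. Does nothing if LiveRegs is already populated.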
static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveRegs.empty()) {
    LiveRegs.init(TRI);
    if (IsProlog) {
      LiveRegs.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }
}

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LivePhysRegs &LiveRegs;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

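  // Move each 32-bit piece of SuperReg through a temporary VGPR and store it
  // to the stack slot at \p FI.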
  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

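  // Write each 32-bit piece of SuperReg into its assigned lane of the spill
  // VGPR recorded for \p FI.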
  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
                         FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LivePhysRegs &LiveRegs, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

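// Stack sizes are per-lane byte counts when flat scratch is enabled; with
// MUBUF scratch they are scaled by the wavefront size, the unit used by the
// swizzled stack/frame pointers.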
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We determined the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found overlaps the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

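  // Flat scratch must be initialized if FLAT_SCR is read directly, the
  // function makes calls, or live stack objects exist while scratch accesses
  // go through flat scratch instructions.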
  bool NeedsFlatScratchInit =
      MFI->hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // cpol
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11). If the
    // shader is actually wave32, we have to modify the const_index_stride
    // field of the descriptor's 3rd sub-reg (bits 22:21) to 0b10 (stride=32).
    // The reason the driver does this is that there can be cases where it
    // presents 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and set up the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // cpol
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. Returns the register holding the saved EXEC.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveRegs.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

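// Emit the prologue stores for the WWM VGPRs and the prolog/epilog SGPR
// spills (FP, BP, etc.) recorded for this function.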
void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special-case the FP spill: if FP is saved to a scratch SGPR, skip it
    // here since that save has already been emitted. Otherwise, FP has been
    // moved to a temporary register, so spill that register instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveRegs.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveRegs.addReg(Reg);
    }
  }
}

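// Mirror of emitCSRSpillStores: reload the prolog/epilog SGPR spills first,
// then the WWM VGPRs, restoring EXEC around the whole-wave reloads.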
void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special-case the FP restore: if FP is to be restored from a scratch
    // SGPR copy, skip it here. Otherwise, restore the FP value into a
    // temporary register; the frame pointer itself is overwritten only at
    // the end, once all other spills have been restored from the current
    // frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveRegs, FramePtrReg);
      SB.save();
      LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveRegs.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(FramePtrReg, RegState::Kill)
        .addImm(-Alignment * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveRegs.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LivePhysRegs LiveRegs;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as the base register. If
    // SGPRForFPSaveRestoreCopy is not valid, restore the previous value of FP
    // into a new scratch register, and copy it to FP later once the other
    // registers have been restored from the current stack frame.
    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveRegs.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
        .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

1281 #ifndef NDEBUG
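// Verify that every SGPR spill stack object, other than the prolog/epilog
// ones managed directly by frame lowering, has already been eliminated by
// SILowerSGPRSpills.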
1282 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1283   const MachineFrameInfo &MFI = MF.getFrameInfo();
1284   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1285   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1286        I != E; ++I) {
1287     if (!MFI.isDeadObjectIndex(I) &&
1288         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1289         !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1290       return false;
1291     }
1292   }
1293 
1294   return true;
1295 }
1296 #endif
1297 
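// Frame indices are resolved as an offset from a single frame register: FP
// when one is reserved, otherwise SP (see SIRegisterInfo::getFrameRegister).
// The object offset is returned unscaled.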
1298 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1299                                                     int FI,
1300                                                     Register &FrameReg) const {
1301   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1302 
1303   FrameReg = RI->getFrameRegister(MF);
1304   return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1305 }
1306 
1307 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1308   MachineFunction &MF,
1309   RegScavenger *RS) const {
1310   MachineFrameInfo &MFI = MF.getFrameInfo();
1311 
1312   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1313   const SIInstrInfo *TII = ST.getInstrInfo();
1314   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1315   MachineRegisterInfo &MRI = MF.getRegInfo();
1316   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1317 
1318   // Allocate spill slots for WWM reserved VGPRs.
1319   if (!FuncInfo->isEntryFunction()) {
1320     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
1321       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1322       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
1323                                  TRI->getSpillAlign(*RC));
1324     }
1325   }
1326 
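  // When MAI instructions are available, a VGPR spill can often be replaced
  // with a copy to a free AGPR (v_accvgpr_write/v_accvgpr_read), avoiding
  // scratch memory traffic entirely.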
1327   const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1328                                && EnableSpillVGPRToAGPR;
1329 
1330   if (SpillVGPRToAGPR) {
1331     // To track the spill frame indices handled in this pass.
1332     BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1333     BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1334 
1335     bool SeenDbgInstr = false;
1336 
1337     for (MachineBasicBlock &MBB : MF) {
1338       for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
1339         int FrameIndex;
1340         if (MI.isDebugInstr())
1341           SeenDbgInstr = true;
1342 
1343         if (TII->isVGPRSpill(MI)) {
1344           // Try to eliminate stack used by VGPR spills before frame
1345           // finalization.
1346           unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1347                                                      AMDGPU::OpName::vaddr);
1348           int FI = MI.getOperand(FIOp).getIndex();
1349           Register VReg =
1350             TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1351           if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1352                                                 TRI->isAGPR(MRI, VReg))) {
1353             // FIXME: change to enterBasicBlockEnd()
1354             RS->enterBasicBlock(MBB);
1355             TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
1356             SpillFIs.set(FI);
1357             continue;
1358           }
1359         } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1360                    TII->isLoadFromStackSlot(MI, FrameIndex))
1361           if (!MFI.isFixedObjectIndex(FrameIndex))
1362             NonVGPRSpillFIs.set(FrameIndex);
1363       }
1364     }
1365 
    // Stack slot coloring may have assigned another object to this stack slot.
    // If no other object shares the slot, the VGPR-to-AGPR spill slot is dead.
1368     for (unsigned FI : SpillFIs.set_bits())
1369       if (!NonVGPRSpillFIs.test(FI))
1370         FuncInfo->setVGPRToAGPRSpillDead(FI);
1371 
1372     for (MachineBasicBlock &MBB : MF) {
1373       for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1374         MBB.addLiveIn(Reg);
1375 
1376       for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1377         MBB.addLiveIn(Reg);
1378 
1379       MBB.sortUniqueLiveIns();
1380 
1381       if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register in
        // the debug value instructions. We should instead update them with the
        // correct register value, but it is unclear whether the register value
        // alone is adequate to lower the DIExpression; that should be worked
        // out later.
1385         for (MachineInstr &MI : MBB) {
1386           if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
1387               !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
1388               SpillFIs[MI.getOperand(0).getIndex()]) {
1389             MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
1390           }
1391         }
1392       }
1393     }
1394   }
1395 
1396   // At this point we've already allocated all spilled SGPRs to VGPRs if we
1397   // can. Any remaining SGPR spills will go to memory, so move them back to the
1398   // default stack.
1399   bool HaveSGPRToVMemSpill =
1400       FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1401   assert(allSGPRSpillsAreDead(MF) &&
1402          "SGPR spill should have been removed in SILowerSGPRSpills");
1403 
1404   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1405   // but currently hasNonSpillStackObjects is set only from source
1406   // allocas. Stack temps produced from legalization are not counted currently.
1407   if (!allStackObjectsAreDead(MFI)) {
1408     assert(RS && "RegScavenger required if spilling");
1409 
1410     // Add an emergency spill slot
1411     RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1412 
1413     // If we are spilling SGPRs to memory with a large frame, we may need a
1414     // second VGPR emergency frame index.
1415     if (HaveSGPRToVMemSpill &&
1416         allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1417       RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
1418     }
1419   }
1420 }
1421 
1422 void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1423     MachineFunction &MF, RegScavenger *RS) const {
1424   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1425   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1426   MachineRegisterInfo &MRI = MF.getRegInfo();
1427   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1428 
1429   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908 we initially reserved the highest available VGPR for the AGPR
    // copy. Now that register allocation is done, check whether an unused VGPR
    // lower than the one reserved before RA exists. If one does, use it for
    // the AGPR copy instead.
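    // For example, if v255 was reserved before RA but v40 is unused after RA,
    // switch the AGPR copy register to v40 to shrink the VGPR footprint.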
1434     Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1435     Register UnusedLowVGPR =
1436         TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1437     if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1438                           TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // The call to setVGPRForAGPRCopy() must happen before
      // freezeReservedRegs() so that getReservedRegs() can reserve the newly
      // identified VGPR for the AGPR copy.
1442       FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1443       MRI.freezeReservedRegs(MF);
1444     }
1445   }
1446 }
1447 
// Special SGPR spills, like the ones needed for the FP, the BP, or any
// reserved registers, are delayed until frame lowering.
1450 void SIFrameLowering::determinePrologEpilogSGPRSaves(
1451     MachineFunction &MF, BitVector &SavedVGPRs) const {
1452   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1453   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1454   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1455   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1456   LivePhysRegs LiveRegs;
1457   LiveRegs.init(*TRI);
1458   // Initially mark callee saved registers as used so we will not choose them
1459   // while looking for scratch SGPRs.
1460   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1461   for (unsigned I = 0; CSRegs[I]; ++I)
1462     LiveRegs.addReg(CSRegs[I]);
1463 
1464   // hasFP only knows about stack objects that already exist. We're now
1465   // determining the stack slots that will be created, so we have to predict
1466   // them. Stack objects force FP usage with calls.
1467   //
1468   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1469   // don't want to report it here.
1470   //
1471   // FIXME: Is this really hasReservedCallFrame?
1472   const bool WillHaveFP =
1473       FrameInfo.hasCalls() &&
1474       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
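  // E.g. a function that makes a call and saves a VGPR CSR will need an FP
  // even though hasFP(MF) may still return false here, because the CSR spill
  // slot has not been created yet.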
1475 
1476   if (WillHaveFP || hasFP(MF)) {
1477     Register FramePtrReg = MFI->getFrameOffsetReg();
1478     assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1479            "Re-reserving spill slot for FP");
1480     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
1481   }
1482 
1483   if (TRI->hasBasePointer(MF)) {
1484     Register BasePtrReg = TRI->getBaseRegister();
1485     assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1486            "Re-reserving spill slot for BP");
1487     getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
1488   }
1489 }
1490 
1491 // Only report VGPRs to generic code.
1492 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1493                                            BitVector &SavedVGPRs,
1494                                            RegScavenger *RS) const {
1495   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1496   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1497   if (MFI->isEntryFunction())
1498     return;
1499 
1500   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1501   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1502 
1503   for (MachineBasicBlock &MBB : MF) {
1504     for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs, so the callee must spill and restore those lanes even
      // if the registers are marked caller-saved.
1508 
      // TODO: Handle this elsewhere, at an earlier point. Walking through all
      // MBBs here is a bad heuristic. A better approach would be to call
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers. That would
      // also help exclude general uses of the WRITELANE/READLANE intrinsics
      // that don't really need any such special handling.
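      // For a writelane the VGPR is the def (operand 0); for a readlane it is
      // the source (operand 1).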
1515       if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
1516         MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
1517       else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
1518         MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
1519     }
1520   }
1521 
1522   // Ignore the SGPRs the default implementation found.
1523   SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1524 
  // Do not save AGPRs prior to GFX90A because there is no easy way to do so.
  // gfx908 has no direct AGPR loads and stores, so spilling an AGPR also
  // requires a temporary VGPR.
1528   if (!ST.hasGFX90AInsts())
1529     SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1530 
1531   determinePrologEpilogSGPRSaves(MF, SavedVGPRs);
1532 
  // The whole-wave VGPR spills are specially inserted in the prologue, so
  // don't allow the default CSR insertion to handle them.
1535   for (auto &Reg : MFI->getWWMSpills())
1536     SavedVGPRs.reset(Reg.first);
1537 
1538   // Mark all lane VGPRs as BB LiveIns.
1539   for (MachineBasicBlock &MBB : MF) {
1540     for (auto &Reg : MFI->getWWMSpills())
1541       MBB.addLiveIn(Reg.first);
1542 
1543     MBB.sortUniqueLiveIns();
1544   }
1545 }
1546 
1547 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1548                                                BitVector &SavedRegs,
1549                                                RegScavenger *RS) const {
1550   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1551   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1552   if (MFI->isEntryFunction())
1553     return;
1554 
1555   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1556   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1557 
1558   // The SP is specifically managed and we don't want extra spills of it.
1559   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1560 
1561   const BitVector AllSavedRegs = SavedRegs;
1562   SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1563 
  // We have to anticipate introducing CSR VGPR spills, or the spill of a
  // caller-saved VGPR reserved for SGPR spills, since we now always create a
  // stack entry for such a VGPR even when no other stack objects exist: an FP
  // is required whenever there is both a call and a stack. A VGPR is allocated
  // for SGPR spills whenever there are any, whether they are CSR spills or
  // not.
1569   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1570   const bool WillHaveFP =
1571       FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1572 
1573   // FP will be specially managed like SP.
1574   if (WillHaveFP || hasFP(MF))
1575     SavedRegs.reset(MFI->getFrameOffsetReg());
1576 
  // The return address use by the return instruction is hidden inside the
  // SI_RETURN pseudo. Given that, and since IPRA computes actual register
  // usage rather than consulting the CSR list, clobbering of the return
  // address by function calls (D117243) or otherwise (D120922) is not seen by
  // IPRA's register usage collection. Marking the register saved here ensures
  // the return address is saved and restored in those scenarios.
1583   const MachineRegisterInfo &MRI = MF.getRegInfo();
1584   Register RetAddrReg = TRI->getReturnAddressReg(MF);
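  // The return address is a 64-bit SGPR pair (s[30:31] by convention), so both
  // 32-bit halves must be marked.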
1585   if (!MFI->isEntryFunction() &&
1586       (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1587     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1588     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1589   }
1590 }
1591 
1592 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1593     MachineFunction &MF, const TargetRegisterInfo *TRI,
1594     std::vector<CalleeSavedInfo> &CSI) const {
1595   if (CSI.empty())
1596     return true; // Early exit if no callee saved registers are modified!
1597 
1598   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1599   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1600   const SIRegisterInfo *RI = ST.getRegisterInfo();
1601   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1602   Register BasePtrReg = RI->getBaseRegister();
1603   Register SGPRForFPSaveRestoreCopy =
1604       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1605   Register SGPRForBPSaveRestoreCopy =
1606       FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1607   if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1608     return false;
1609 
1610   unsigned NumModifiedRegs = 0;
1611 
1612   if (SGPRForFPSaveRestoreCopy)
1613     NumModifiedRegs++;
1614   if (SGPRForBPSaveRestoreCopy)
1615     NumModifiedRegs++;
1616 
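  // Redirect the FP/BP entries in CSI to the chosen scratch SGPRs; PEI will
  // then emit register copies for them instead of memory spills. Returning
  // false afterwards lets the generic code assign stack slots for the
  // remaining CSRs.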
  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      // Stop once every register that needed a scratch copy has been handled.
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }
1628 
1629   return false;
1630 }
1631 
1632 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1633   const MachineFunction &MF) const {
1634 
1635   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1636   const MachineFrameInfo &MFI = MF.getFrameInfo();
1637   uint64_t EstStackSize = MFI.estimateStackSize(MF);
1638   uint64_t MaxOffset = EstStackSize - 1;
1639 
1640   // We need the emergency stack slots to be allocated in range of the
1641   // MUBUF/flat scratch immediate offset from the base register, so assign these
1642   // first at the incoming SP position.
1643   //
1644   // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of space
1646   // on frames with alignment requirements.
1647   if (ST.enableFlatScratch()) {
1648     const SIInstrInfo *TII = ST.getInstrInfo();
1649     if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1650                                SIInstrFlags::FlatScratch))
1651       return false;
1652   } else {
1653     if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
1654       return false;
1655   }
1656 
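  // The frame is too large to reach with a direct immediate offset; e.g. MUBUF
  // immediate offsets are 12-bit unsigned (max 4095), so an estimated frame
  // larger than 4 KiB without flat scratch lands here.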
1657   return true;
1658 }
1659 
1660 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1661   MachineFunction &MF,
1662   MachineBasicBlock &MBB,
1663   MachineBasicBlock::iterator I) const {
1664   int64_t Amount = I->getOperand(0).getImm();
1665   if (Amount == 0)
1666     return MBB.erase(I);
1667 
1668   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1669   const SIInstrInfo *TII = ST.getInstrInfo();
1670   const DebugLoc &DL = I->getDebugLoc();
1671   unsigned Opc = I->getOpcode();
1672   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1673   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1674 
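  // Without a reserved call frame, the call-frame pseudos become real SP
  // adjustments, scaled like all SP arithmetic; e.g. an 8-byte outgoing area
  // on a wave64 target without flat scratch adjusts SP by 8 * 64 = 512.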
1675   if (!hasReservedCallFrame(MF)) {
1676     Amount = alignTo(Amount, getStackAlign());
1677     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1678     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1679     Register SPReg = MFI->getStackPtrOffsetReg();
1680 
1681     Amount *= getScratchScaleFactor(ST);
1682     if (IsDestroy)
1683       Amount = -Amount;
1684     auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1685         .addReg(SPReg)
1686         .addImm(Amount);
1687     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1688   } else if (CalleePopAmount != 0) {
1689     llvm_unreachable("is this used?");
1690   }
1691 
1692   return MBB.erase(I);
1693 }
1694 
1695 /// Returns true if the frame will require a reference to the stack pointer.
1696 ///
1697 /// This is the set of conditions common to setting up the stack pointer in a
1698 /// kernel, and for using a frame pointer in a callable function.
1699 ///
1700 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1701 /// references SP.
1702 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1703   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1704 }
1705 
// The FP for kernels is always known to be 0, so we never really need to set
// up an explicit register for it. However, DisableFramePointerElim will force
// us to use a register for it.
1709 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1710   const MachineFrameInfo &MFI = MF.getFrameInfo();
1711 
1712   // For entry functions we can use an immediate offset in most cases, so the
1713   // presence of calls doesn't imply we need a distinct frame pointer.
1714   if (MFI.hasCalls() &&
1715       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1716     // All offsets are unsigned, so need to be addressed in the same direction
1717     // as stack growth.
1718 
1719     // FIXME: This function is pretty broken, since it can be called before the
1720     // frame layout is determined or CSR spills are inserted.
1721     return MFI.getStackSize() != 0;
1722   }
1723 
1724   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1725          MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1726              MF) ||
1727          MF.getTarget().Options.DisableFramePointerElim(MF);
1728 }
1729 
// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known to be 0 on entry to kernels, we never really need
// an FP register. We may need to initialize the stack pointer depending on the
// frame properties, which logically overlaps many of the cases where an
// ordinary function would require an FP.
1735 bool SIFrameLowering::requiresStackPointerReference(
1736     const MachineFunction &MF) const {
1737   // Callable functions always require a stack pointer reference.
1738   assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1739          "only expected to call this for entry points");
1740 
1741   const MachineFrameInfo &MFI = MF.getFrameInfo();
1742 
1743   // Entry points ordinarily don't need to initialize SP. We have to set it up
1744   // for callees if there are any. Also note tail calls are impossible/don't
1745   // make any sense for kernels.
1746   if (MFI.hasCalls())
1747     return true;
1748 
1749   // We still need to initialize the SP if we're doing anything weird that
1750   // references the SP, like variable sized stack objects.
1751   return frameTriviallyRequiresSP(MFI);
1752 }
1753