xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (revision ac77b2621508c6a50ab01d07fe8d43795d908f05)
1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 
9 #include "SIFrameLowering.h"
10 #include "AMDGPU.h"
11 #include "GCNSubtarget.h"
12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13 #include "SIMachineFunctionInfo.h"
14 #include "llvm/CodeGen/LiveRegUnits.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/RegisterScavenging.h"
17 #include "llvm/Target/TargetMachine.h"
18 
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "frame-info"
22 
23 static cl::opt<bool> EnableSpillVGPRToAGPR(
24   "amdgpu-spill-vgpr-to-agpr",
25   cl::desc("Enable spilling VGPRs to AGPRs"),
26   cl::ReallyHidden,
27   cl::init(true));
28 
29 // Find a register matching \p RC from \p LiveUnits which is unused and
30 // available throughout the function. On failure, returns AMDGPU::NoRegister.
31 // TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32 // MCRegisters. This should reduce the number of iterations and avoid redundant
33 // checking.
34 static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35                                      const LiveRegUnits &LiveUnits,
36                                      const TargetRegisterClass &RC) {
37   for (MCRegister Reg : RC) {
38     if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
39         !MRI.isReserved(Reg))
40       return Reg;
41   }
42   return MCRegister();
43 }
44 
45 // Find a scratch register that we can use in the prologue. We avoid using
46 // callee-save registers since they may appear to be free when this is called
47 // from canUseAsPrologue (during shrink wrapping), but then no longer be free
48 // when this is called from emitPrologue.
49 static MCRegister findScratchNonCalleeSaveRegister(
50     MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51     const TargetRegisterClass &RC, bool Unused = false) {
52   // Mark callee saved registers as used so we will not choose them.
53   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54   for (unsigned i = 0; CSRegs[i]; ++i)
55     LiveUnits.addReg(CSRegs[i]);
56 
57   // We are looking for a register that can be used throughout the entire
58   // function, so any use is unacceptable.
59   if (Unused)
60     return findUnusedRegister(MRI, LiveUnits, RC);
61 
62   for (MCRegister Reg : RC) {
63     if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
64       return Reg;
65   }
66 
67   return MCRegister();
68 }
69 
70 /// Query target location for spilling SGPRs
71 /// \p IncludeScratchCopy : Also look for free scratch SGPRs
72 static void getVGPRSpillLaneOrTempRegister(
73     MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
74     const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
75     bool IncludeScratchCopy = true) {
76   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
77   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
78 
79   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80   const SIRegisterInfo *TRI = ST.getRegisterInfo();
81   unsigned Size = TRI->getSpillSize(RC);
82   Align Alignment = TRI->getSpillAlign(RC);
83 
84   // We need to save and restore the given SGPR.
85 
86   Register ScratchSGPR;
87   // 1: Try to save the given register into an unused scratch SGPR. The
88   // LiveUnits should have all the callee saved registers marked as used. For
89   // certain cases we skip copy to scratch SGPR.
90   if (IncludeScratchCopy)
91     ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
92 
93   if (!ScratchSGPR) {
94     int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
95                                          TargetStackID::SGPRSpill);
96 
97     if (TRI->spillSGPRToVGPR() &&
98         MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
99                                          /*IsPrologEpilog=*/true)) {
100       // 2: There's no free lane to spill, and no free register to save the
101       // SGPR, so we're forced to take another VGPR to use for the spill.
102       MFI->addToPrologEpilogSGPRSpills(
103           SGPR, PrologEpilogSGPRSaveRestoreInfo(
104                     SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
105 
106       LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
107                  dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
108                         << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
109                         << '\n';);
110     } else {
111       // Remove dead <FI> index
112       MF.getFrameInfo().RemoveStackObject(FI);
113       // 3: If all else fails, spill the register to memory.
114       FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
115       MFI->addToPrologEpilogSGPRSpills(
116           SGPR,
117           PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
118       LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
119                         << printReg(SGPR, TRI) << '\n');
120     }
121   } else {
122     MFI->addToPrologEpilogSGPRSpills(
123         SGPR, PrologEpilogSGPRSaveRestoreInfo(
124                   SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
125     LiveUnits.addReg(ScratchSGPR);
126     LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
127                       << printReg(ScratchSGPR, TRI) << '\n');
128   }
129 }
130 
131 // We need to specially emit stack operations here because a different frame
132 // register is used than in the rest of the function, as getFrameRegister would
133 // use.
134 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
135                              const SIMachineFunctionInfo &FuncInfo,
136                              LiveRegUnits &LiveUnits, MachineFunction &MF,
137                              MachineBasicBlock &MBB,
138                              MachineBasicBlock::iterator I, const DebugLoc &DL,
139                              Register SpillReg, int FI, Register FrameReg,
140                              int64_t DwordOff = 0) {
141   unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
142                                         : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
143 
144   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
145   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
146   MachineMemOperand *MMO = MF.getMachineMemOperand(
147       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
148       FrameInfo.getObjectAlign(FI));
149   LiveUnits.addReg(SpillReg);
150   bool IsKill = !MBB.isLiveIn(SpillReg);
151   TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
152                           DwordOff, MMO, nullptr, &LiveUnits);
153   if (IsKill)
154     LiveUnits.removeReg(SpillReg);
155 }
156 
157 static void buildEpilogRestore(const GCNSubtarget &ST,
158                                const SIRegisterInfo &TRI,
159                                const SIMachineFunctionInfo &FuncInfo,
160                                LiveRegUnits &LiveUnits, MachineFunction &MF,
161                                MachineBasicBlock &MBB,
162                                MachineBasicBlock::iterator I,
163                                const DebugLoc &DL, Register SpillReg, int FI,
164                                Register FrameReg, int64_t DwordOff = 0) {
165   unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
166                                         : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
167 
168   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
169   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
170   MachineMemOperand *MMO = MF.getMachineMemOperand(
171       PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
172       FrameInfo.getObjectAlign(FI));
173   TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
174                           DwordOff, MMO, nullptr, &LiveUnits);
175 }
176 
177 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
178                         const DebugLoc &DL, const SIInstrInfo *TII,
179                         Register TargetReg) {
180   MachineFunction *MF = MBB.getParent();
181   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
182   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
183   const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
184   Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
185   Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
186 
187   if (MFI->getGITPtrHigh() != 0xffffffff) {
188     BuildMI(MBB, I, DL, SMovB32, TargetHi)
189         .addImm(MFI->getGITPtrHigh())
190         .addReg(TargetReg, RegState::ImplicitDefine);
191   } else {
192     const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
193     BuildMI(MBB, I, DL, GetPC64, TargetReg);
194   }
195   Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
196   MF->getRegInfo().addLiveIn(GitPtrLo);
197   MBB.addLiveIn(GitPtrLo);
198   BuildMI(MBB, I, DL, SMovB32, TargetLo)
199     .addReg(GitPtrLo);
200 }
201 
202 static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
203                           const SIMachineFunctionInfo *FuncInfo,
204                           MachineFunction &MF, MachineBasicBlock &MBB,
205                           MachineBasicBlock::iterator MBBI, bool IsProlog) {
206   if (LiveUnits.empty()) {
207     LiveUnits.init(TRI);
208     if (IsProlog) {
209       LiveUnits.addLiveIns(MBB);
210     } else {
211       // In epilog.
212       LiveUnits.addLiveOuts(MBB);
213       LiveUnits.stepBackward(*MBBI);
214     }
215   }
216 }
217 
218 namespace llvm {
219 
220 // SpillBuilder to save/restore special SGPR spills like the one needed for FP,
221 // BP, etc. These spills are delayed until the current function's frame is
222 // finalized. For a given register, the builder uses the
223 // PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
224 class PrologEpilogSGPRSpillBuilder {
225   MachineBasicBlock::iterator MI;
226   MachineBasicBlock &MBB;
227   MachineFunction &MF;
228   const GCNSubtarget &ST;
229   MachineFrameInfo &MFI;
230   SIMachineFunctionInfo *FuncInfo;
231   const SIInstrInfo *TII;
232   const SIRegisterInfo &TRI;
233   Register SuperReg;
234   const PrologEpilogSGPRSaveRestoreInfo SI;
235   LiveRegUnits &LiveUnits;
236   const DebugLoc &DL;
237   Register FrameReg;
238   ArrayRef<int16_t> SplitParts;
239   unsigned NumSubRegs;
240   unsigned EltSize = 4;
241 
242   void saveToMemory(const int FI) const {
243     MachineRegisterInfo &MRI = MF.getRegInfo();
244     assert(!MFI.isDeadObjectIndex(FI));
245 
246     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
247 
248     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
249         MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
250     if (!TmpVGPR)
251       report_fatal_error("failed to find free scratch register");
252 
253     for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
254       Register SubReg = NumSubRegs == 1
255                             ? SuperReg
256                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
257       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
258           .addReg(SubReg);
259 
260       buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
261                        FI, FrameReg, DwordOff);
262       DwordOff += 4;
263     }
264   }
265 
266   void saveToVGPRLane(const int FI) const {
267     assert(!MFI.isDeadObjectIndex(FI));
268 
269     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
270     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
271         FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
272     assert(Spill.size() == NumSubRegs);
273 
274     for (unsigned I = 0; I < NumSubRegs; ++I) {
275       Register SubReg = NumSubRegs == 1
276                             ? SuperReg
277                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
278       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
279               Spill[I].VGPR)
280           .addReg(SubReg)
281           .addImm(Spill[I].Lane)
282           .addReg(Spill[I].VGPR, RegState::Undef);
283     }
284   }
285 
286   void copyToScratchSGPR(Register DstReg) const {
287     BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
288         .addReg(SuperReg)
289         .setMIFlag(MachineInstr::FrameSetup);
290   }
291 
292   void restoreFromMemory(const int FI) {
293     MachineRegisterInfo &MRI = MF.getRegInfo();
294 
295     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
296     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
297         MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
298     if (!TmpVGPR)
299       report_fatal_error("failed to find free scratch register");
300 
301     for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
302       Register SubReg = NumSubRegs == 1
303                             ? SuperReg
304                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
305 
306       buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
307                          TmpVGPR, FI, FrameReg, DwordOff);
308       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
309           .addReg(TmpVGPR, RegState::Kill);
310       DwordOff += 4;
311     }
312   }
313 
314   void restoreFromVGPRLane(const int FI) {
315     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
316     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
317         FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
318     assert(Spill.size() == NumSubRegs);
319 
320     for (unsigned I = 0; I < NumSubRegs; ++I) {
321       Register SubReg = NumSubRegs == 1
322                             ? SuperReg
323                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
324       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
325           .addReg(Spill[I].VGPR)
326           .addImm(Spill[I].Lane);
327     }
328   }
329 
330   void copyFromScratchSGPR(Register SrcReg) const {
331     BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
332         .addReg(SrcReg)
333         .setMIFlag(MachineInstr::FrameDestroy);
334   }
335 
336 public:
337   PrologEpilogSGPRSpillBuilder(Register Reg,
338                                const PrologEpilogSGPRSaveRestoreInfo SI,
339                                MachineBasicBlock &MBB,
340                                MachineBasicBlock::iterator MI,
341                                const DebugLoc &DL, const SIInstrInfo *TII,
342                                const SIRegisterInfo &TRI,
343                                LiveRegUnits &LiveUnits, Register FrameReg)
344       : MI(MI), MBB(MBB), MF(*MBB.getParent()),
345         ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
346         FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
347         SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
348         FrameReg(FrameReg) {
349     const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
350     SplitParts = TRI.getRegSplitParts(RC, EltSize);
351     NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
352 
353     assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
354   }
355 
356   void save() {
357     switch (SI.getKind()) {
358     case SGPRSaveKind::SPILL_TO_MEM:
359       return saveToMemory(SI.getIndex());
360     case SGPRSaveKind::SPILL_TO_VGPR_LANE:
361       return saveToVGPRLane(SI.getIndex());
362     case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
363       return copyToScratchSGPR(SI.getReg());
364     }
365   }
366 
367   void restore() {
368     switch (SI.getKind()) {
369     case SGPRSaveKind::SPILL_TO_MEM:
370       return restoreFromMemory(SI.getIndex());
371     case SGPRSaveKind::SPILL_TO_VGPR_LANE:
372       return restoreFromVGPRLane(SI.getIndex());
373     case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
374       return copyFromScratchSGPR(SI.getReg());
375     }
376   }
377 };
378 
379 } // namespace llvm
380 
381 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
382 void SIFrameLowering::emitEntryFunctionFlatScratchInit(
383     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
384     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
385   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
386   const SIInstrInfo *TII = ST.getInstrInfo();
387   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
388   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
389 
390   // We don't need this if we only have spills since there is no user facing
391   // scratch.
392 
393   // TODO: If we know we don't have flat instructions earlier, we can omit
394   // this from the input registers.
395   //
396   // TODO: We only need to know if we access scratch space through a flat
397   // pointer. Because we only detect if flat instructions are used at all,
398   // this will be used more often than necessary on VI.
399 
400   Register FlatScrInitLo;
401   Register FlatScrInitHi;
402 
403   if (ST.isAmdPalOS()) {
404     // Extract the scratch offset from the descriptor in the GIT
405     LiveRegUnits LiveUnits;
406     LiveUnits.init(*TRI);
407     LiveUnits.addLiveIns(MBB);
408 
409     // Find unused reg to load flat scratch init into
410     MachineRegisterInfo &MRI = MF.getRegInfo();
411     Register FlatScrInit = AMDGPU::NoRegister;
412     ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
413     unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
414     AllSGPR64s = AllSGPR64s.slice(
415         std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
416     Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
417     for (MCPhysReg Reg : AllSGPR64s) {
418       if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
419           MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
420         FlatScrInit = Reg;
421         break;
422       }
423     }
424     assert(FlatScrInit && "Failed to find free register for scratch init");
425 
426     FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
427     FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
428 
429     buildGitPtr(MBB, I, DL, TII, FlatScrInit);
430 
431     // We now have the GIT ptr - now get the scratch descriptor from the entry
432     // at offset 0 (or offset 16 for a compute shader).
433     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
434     const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
435     auto *MMO = MF.getMachineMemOperand(
436         PtrInfo,
437         MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
438             MachineMemOperand::MODereferenceable,
439         8, Align(4));
440     unsigned Offset =
441         MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
442     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
443     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
444     BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
445         .addReg(FlatScrInit)
446         .addImm(EncodedOffset) // offset
447         .addImm(0)             // cpol
448         .addMemOperand(MMO);
449 
450     // Mask the offset in [47:0] of the descriptor
451     const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
452     auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
453         .addReg(FlatScrInitHi)
454         .addImm(0xffff);
455     And->getOperand(3).setIsDead(); // Mark SCC as dead.
456   } else {
457     Register FlatScratchInitReg =
458         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
459     assert(FlatScratchInitReg);
460 
461     MachineRegisterInfo &MRI = MF.getRegInfo();
462     MRI.addLiveIn(FlatScratchInitReg);
463     MBB.addLiveIn(FlatScratchInitReg);
464 
465     FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
466     FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
467   }
468 
469   // Do a 64-bit pointer add.
470   if (ST.flatScratchIsPointer()) {
471     if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
472       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
473         .addReg(FlatScrInitLo)
474         .addReg(ScratchWaveOffsetReg);
475       auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
476                           FlatScrInitHi)
477         .addReg(FlatScrInitHi)
478         .addImm(0);
479       Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
480 
481       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
482         addReg(FlatScrInitLo).
483         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
484                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
485       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
486         addReg(FlatScrInitHi).
487         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
488                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
489       return;
490     }
491 
492     // For GFX9.
493     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
494       .addReg(FlatScrInitLo)
495       .addReg(ScratchWaveOffsetReg);
496     auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
497                         AMDGPU::FLAT_SCR_HI)
498       .addReg(FlatScrInitHi)
499       .addImm(0);
500     Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
501 
502     return;
503   }
504 
505   assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
506 
507   // Copy the size in bytes.
508   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
509     .addReg(FlatScrInitHi, RegState::Kill);
510 
511   // Add wave offset in bytes to private base offset.
512   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
513   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
514       .addReg(FlatScrInitLo)
515       .addReg(ScratchWaveOffsetReg);
516 
517   // Convert offset to 256-byte units.
518   auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
519                       AMDGPU::FLAT_SCR_HI)
520     .addReg(FlatScrInitLo, RegState::Kill)
521     .addImm(8);
522   LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
523 }
524 
525 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
526 // memory. They should have been removed by now.
527 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
528   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
529        I != E; ++I) {
530     if (!MFI.isDeadObjectIndex(I))
531       return false;
532   }
533 
534   return true;
535 }
536 
537 // Shift down registers reserved for the scratch RSRC.
538 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
539     MachineFunction &MF) const {
540 
541   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
542   const SIInstrInfo *TII = ST.getInstrInfo();
543   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
544   MachineRegisterInfo &MRI = MF.getRegInfo();
545   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
546 
547   assert(MFI->isEntryFunction());
548 
549   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
550 
551   if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
552                           allStackObjectsAreDead(MF.getFrameInfo())))
553     return Register();
554 
555   if (ST.hasSGPRInitBug() ||
556       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
557     return ScratchRsrcReg;
558 
559   // We reserved the last registers for this. Shift it down to the end of those
560   // which were actually used.
561   //
562   // FIXME: It might be safer to use a pseudoregister before replacement.
563 
564   // FIXME: We should be able to eliminate unused input registers. We only
565   // cannot do this for the resources required for scratch access. For now we
566   // skip over user SGPRs and may leave unused holes.
567 
568   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
569   ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
570   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
571 
572   // Skip the last N reserved elements because they should have already been
573   // reserved for VCC etc.
574   Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
575   for (MCPhysReg Reg : AllSGPR128s) {
576     // Pick the first unallocated one. Make sure we don't clobber the other
577     // reserved input we needed. Also for PAL, make sure we don't clobber
578     // the GIT pointer passed in SGPR0 or SGPR8.
579     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
580         (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
581       MRI.replaceRegWith(ScratchRsrcReg, Reg);
582       MFI->setScratchRSrcReg(Reg);
583       return Reg;
584     }
585   }
586 
587   return ScratchRsrcReg;
588 }
589 
590 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
591   return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
592 }
593 
594 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
595                                                 MachineBasicBlock &MBB) const {
596   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
597 
598   // FIXME: If we only have SGPR spills, we won't actually be using scratch
599   // memory since these spill to VGPRs. We should be cleaning up these unused
600   // SGPR spill frame indices somewhere.
601 
602   // FIXME: We still have implicit uses on SGPR spill instructions in case they
603   // need to spill to vector memory. It's likely that will not happen, but at
604   // this point it appears we need the setup. This part of the prolog should be
605   // emitted after frame indices are eliminated.
606 
607   // FIXME: Remove all of the isPhysRegUsed checks
608 
609   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
610   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
611   const SIInstrInfo *TII = ST.getInstrInfo();
612   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
613   MachineRegisterInfo &MRI = MF.getRegInfo();
614   const Function &F = MF.getFunction();
615   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
616 
617   assert(MFI->isEntryFunction());
618 
619   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
620       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
621 
622   // We need to do the replacement of the private segment buffer register even
623   // if there are no stack objects. There could be stores to undef or a
624   // constant without an associated object.
625   //
626   // This will return `Register()` in cases where there are no actual
627   // uses of the SRSRC.
628   Register ScratchRsrcReg;
629   if (!ST.enableFlatScratch())
630     ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
631 
632   // Make the selected register live throughout the function.
633   if (ScratchRsrcReg) {
634     for (MachineBasicBlock &OtherBB : MF) {
635       if (&OtherBB != &MBB) {
636         OtherBB.addLiveIn(ScratchRsrcReg);
637       }
638     }
639   }
640 
641   // Now that we have fixed the reserved SRSRC we need to locate the
642   // (potentially) preloaded SRSRC.
643   Register PreloadedScratchRsrcReg;
644   if (ST.isAmdHsaOrMesa(F)) {
645     PreloadedScratchRsrcReg =
646         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
647     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
648       // We added live-ins during argument lowering, but since they were not
649       // used they were deleted. We're adding the uses now, so add them back.
650       MRI.addLiveIn(PreloadedScratchRsrcReg);
651       MBB.addLiveIn(PreloadedScratchRsrcReg);
652     }
653   }
654 
655   // Debug location must be unknown since the first debug location is used to
656   // determine the end of the prologue.
657   DebugLoc DL;
658   MachineBasicBlock::iterator I = MBB.begin();
659 
660   // We found the SRSRC first because it needs four registers and has an
661   // alignment requirement. If the SRSRC that we found is clobbering with
662   // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
663   // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
664   // wave offset to a free SGPR.
665   Register ScratchWaveOffsetReg;
666   if (PreloadedScratchWaveOffsetReg &&
667       TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
668     ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
669     unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
670     AllSGPRs = AllSGPRs.slice(
671         std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
672     Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
673     for (MCPhysReg Reg : AllSGPRs) {
674       if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
675           !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
676         ScratchWaveOffsetReg = Reg;
677         BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
678             .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
679         break;
680       }
681     }
682   } else {
683     ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
684   }
685   assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
686 
687   if (requiresStackPointerReference(MF)) {
688     Register SPReg = MFI->getStackPtrOffsetReg();
689     assert(SPReg != AMDGPU::SP_REG);
690     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
691         .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
692   }
693 
694   if (hasFP(MF)) {
695     Register FPReg = MFI->getFrameOffsetReg();
696     assert(FPReg != AMDGPU::FP_REG);
697     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
698   }
699 
700   bool NeedsFlatScratchInit =
701       MFI->getUserSGPRInfo().hasFlatScratchInit() &&
702       (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
703        (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
704 
705   if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
706       PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
707     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
708     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
709   }
710 
711   if (NeedsFlatScratchInit) {
712     emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
713   }
714 
715   if (ScratchRsrcReg) {
716     emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
717                                          PreloadedScratchRsrcReg,
718                                          ScratchRsrcReg, ScratchWaveOffsetReg);
719   }
720 }
721 
722 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
723 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
724     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
725     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
726     Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
727 
728   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
729   const SIInstrInfo *TII = ST.getInstrInfo();
730   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
731   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
732   const Function &Fn = MF.getFunction();
733 
734   if (ST.isAmdPalOS()) {
735     // The pointer to the GIT is formed from the offset passed in and either
736     // the amdgpu-git-ptr-high function attribute or the top part of the PC
737     Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
738     Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
739 
740     buildGitPtr(MBB, I, DL, TII, Rsrc01);
741 
742     // We now have the GIT ptr - now get the scratch descriptor from the entry
743     // at offset 0 (or offset 16 for a compute shader).
744     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
745     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
746     auto MMO = MF.getMachineMemOperand(PtrInfo,
747                                        MachineMemOperand::MOLoad |
748                                            MachineMemOperand::MOInvariant |
749                                            MachineMemOperand::MODereferenceable,
750                                        16, Align(4));
751     unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
752     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
753     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
754     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
755       .addReg(Rsrc01)
756       .addImm(EncodedOffset) // offset
757       .addImm(0) // cpol
758       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
759       .addMemOperand(MMO);
760 
761     // The driver will always set the SRD for wave 64 (bits 118:117 of
762     // descriptor / bits 22:21 of third sub-reg will be 0b11)
763     // If the shader is actually wave32 we have to modify the const_index_stride
764     // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
765     // reason the driver does this is that there can be cases where it presents
766     // 2 shaders with different wave size (e.g. VsFs).
767     // TODO: convert to using SCRATCH instructions or multiple SRD buffers
768     if (ST.isWave32()) {
769       const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
770       BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
771           .addImm(21)
772           .addReg(Rsrc03);
773     }
774   } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
775     assert(!ST.isAmdHsaOrMesa(Fn));
776     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
777 
778     Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
779     Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
780 
781     // Use relocations to get the pointer, and setup the other bits manually.
782     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
783 
784     if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
785       Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
786 
787       if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
788         const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
789 
790         BuildMI(MBB, I, DL, Mov64, Rsrc01)
791           .addReg(MFI->getImplicitBufferPtrUserSGPR())
792           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
793       } else {
794         const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
795 
796         MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
797         auto MMO = MF.getMachineMemOperand(
798             PtrInfo,
799             MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
800                 MachineMemOperand::MODereferenceable,
801             8, Align(4));
802         BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
803           .addReg(MFI->getImplicitBufferPtrUserSGPR())
804           .addImm(0) // offset
805           .addImm(0) // cpol
806           .addMemOperand(MMO)
807           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
808 
809         MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
810         MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
811       }
812     } else {
813       Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
814       Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
815 
816       BuildMI(MBB, I, DL, SMovB32, Rsrc0)
817         .addExternalSymbol("SCRATCH_RSRC_DWORD0")
818         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
819 
820       BuildMI(MBB, I, DL, SMovB32, Rsrc1)
821         .addExternalSymbol("SCRATCH_RSRC_DWORD1")
822         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
823     }
824 
825     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
826       .addImm(Rsrc23 & 0xffffffff)
827       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
828 
829     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
830       .addImm(Rsrc23 >> 32)
831       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
832   } else if (ST.isAmdHsaOrMesa(Fn)) {
833     assert(PreloadedScratchRsrcReg);
834 
835     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
836       BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
837           .addReg(PreloadedScratchRsrcReg, RegState::Kill);
838     }
839   }
840 
841   // Add the scratch wave offset into the scratch RSRC.
842   //
843   // We only want to update the first 48 bits, which is the base address
844   // pointer, without touching the adjacent 16 bits of flags. We know this add
845   // cannot carry-out from bit 47, otherwise the scratch allocation would be
846   // impossible to fit in the 48-bit global address space.
847   //
848   // TODO: Evaluate if it is better to just construct an SRD using the flat
849   // scratch init and some constants rather than update the one we are passed.
850   Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
851   Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
852 
853   // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
854   // the kernel body via inreg arguments.
855   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
856       .addReg(ScratchRsrcSub0)
857       .addReg(ScratchWaveOffsetReg)
858       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
859   auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
860       .addReg(ScratchRsrcSub1)
861       .addImm(0)
862       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
863   Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
864 }
865 
866 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
867   switch (ID) {
868   case TargetStackID::Default:
869   case TargetStackID::NoAlloc:
870   case TargetStackID::SGPRSpill:
871     return true;
872   case TargetStackID::ScalableVector:
873   case TargetStackID::WasmLocal:
874     return false;
875   }
876   llvm_unreachable("Invalid TargetStackID::Value");
877 }
878 
879 // Activate only the inactive lanes when \p EnableInactiveLanes is true.
880 // Otherwise, activate all lanes. It returns the saved exec.
881 static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
882                                      MachineFunction &MF,
883                                      MachineBasicBlock &MBB,
884                                      MachineBasicBlock::iterator MBBI,
885                                      const DebugLoc &DL, bool IsProlog,
886                                      bool EnableInactiveLanes) {
887   Register ScratchExecCopy;
888   MachineRegisterInfo &MRI = MF.getRegInfo();
889   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
890   const SIInstrInfo *TII = ST.getInstrInfo();
891   const SIRegisterInfo &TRI = TII->getRegisterInfo();
892   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
893 
894   initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
895 
896   ScratchExecCopy = findScratchNonCalleeSaveRegister(
897       MRI, LiveUnits, *TRI.getWaveMaskRegClass());
898   if (!ScratchExecCopy)
899     report_fatal_error("failed to find free scratch register");
900 
901   LiveUnits.addReg(ScratchExecCopy);
902 
903   const unsigned SaveExecOpc =
904       ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
905                                            : AMDGPU::S_OR_SAVEEXEC_B32)
906                     : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
907                                            : AMDGPU::S_OR_SAVEEXEC_B64);
908   auto SaveExec =
909       BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
910   SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
911 
912   return ScratchExecCopy;
913 }
914 
915 void SIFrameLowering::emitCSRSpillStores(
916     MachineFunction &MF, MachineBasicBlock &MBB,
917     MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
918     Register FrameReg, Register FramePtrRegScratchCopy) const {
919   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
920   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
921   const SIInstrInfo *TII = ST.getInstrInfo();
922   const SIRegisterInfo &TRI = TII->getRegisterInfo();
923 
924   // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
925   // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
926   // might end up flipping the EXEC bits twice.
927   Register ScratchExecCopy;
928   SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
929   FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
930   if (!WWMScratchRegs.empty())
931     ScratchExecCopy =
932         buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
933                              /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
934 
935   auto StoreWWMRegisters =
936       [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
937         for (const auto &Reg : WWMRegs) {
938           Register VGPR = Reg.first;
939           int FI = Reg.second;
940           buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
941                            VGPR, FI, FrameReg);
942         }
943       };
944 
945   StoreWWMRegisters(WWMScratchRegs);
946   if (!WWMCalleeSavedRegs.empty()) {
947     if (ScratchExecCopy) {
948       unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
949       BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
950     } else {
951       ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
952                                              /*IsProlog*/ true,
953                                              /*EnableInactiveLanes*/ false);
954     }
955   }
956 
957   StoreWWMRegisters(WWMCalleeSavedRegs);
958   if (ScratchExecCopy) {
959     // FIXME: Split block and make terminator.
960     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
961     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
962         .addReg(ScratchExecCopy, RegState::Kill);
963     LiveUnits.addReg(ScratchExecCopy);
964   }
965 
966   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
967 
968   for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
969     // Special handle FP spill:
970     // Skip if FP is saved to a scratch SGPR, the save has already been emitted.
971     // Otherwise, FP has been moved to a temporary register and spill it
972     // instead.
973     Register Reg =
974         Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
975     if (!Reg)
976       continue;
977 
978     PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
979                                     LiveUnits, FrameReg);
980     SB.save();
981   }
982 
983   // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
984   // such scratch registers live throughout the function.
985   SmallVector<Register, 1> ScratchSGPRs;
986   FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
987   if (!ScratchSGPRs.empty()) {
988     for (MachineBasicBlock &MBB : MF) {
989       for (MCPhysReg Reg : ScratchSGPRs)
990         MBB.addLiveIn(Reg);
991 
992       MBB.sortUniqueLiveIns();
993     }
994     if (!LiveUnits.empty()) {
995       for (MCPhysReg Reg : ScratchSGPRs)
996         LiveUnits.addReg(Reg);
997     }
998   }
999 }
1000 
1001 void SIFrameLowering::emitCSRSpillRestores(
1002     MachineFunction &MF, MachineBasicBlock &MBB,
1003     MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
1004     Register FrameReg, Register FramePtrRegScratchCopy) const {
1005   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1006   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1007   const SIInstrInfo *TII = ST.getInstrInfo();
1008   const SIRegisterInfo &TRI = TII->getRegisterInfo();
1009   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1010 
1011   for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1012     // Special handle FP restore:
1013     // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
1014     // the FP value to a temporary register. The frame pointer should be
1015     // overwritten only at the end when all other spills are restored from
1016     // current frame.
1017     Register Reg =
1018         Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1019     if (!Reg)
1020       continue;
1021 
1022     PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1023                                     LiveUnits, FrameReg);
1024     SB.restore();
1025   }
1026 
1027   // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1028   // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1029   // this, we might end up flipping the EXEC bits twice.
1030   Register ScratchExecCopy;
1031   SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1032   FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
1033   if (!WWMScratchRegs.empty())
1034     ScratchExecCopy =
1035         buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1036                              /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
1037 
1038   auto RestoreWWMRegisters =
1039       [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1040         for (const auto &Reg : WWMRegs) {
1041           Register VGPR = Reg.first;
1042           int FI = Reg.second;
1043           buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
1044                              VGPR, FI, FrameReg);
1045         }
1046       };
1047 
1048   RestoreWWMRegisters(WWMScratchRegs);
1049   if (!WWMCalleeSavedRegs.empty()) {
1050     if (ScratchExecCopy) {
1051       unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1052       BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
1053     } else {
1054       ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1055                                              /*IsProlog*/ false,
1056                                              /*EnableInactiveLanes*/ false);
1057     }
1058   }
1059 
1060   RestoreWWMRegisters(WWMCalleeSavedRegs);
1061   if (ScratchExecCopy) {
1062     // FIXME: Split block and make terminator.
1063     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1064     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
1065         .addReg(ScratchExecCopy, RegState::Kill);
1066   }
1067 }
1068 
1069 void SIFrameLowering::emitPrologue(MachineFunction &MF,
1070                                    MachineBasicBlock &MBB) const {
1071   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1072   if (FuncInfo->isEntryFunction()) {
1073     emitEntryFunctionPrologue(MF, MBB);
1074     return;
1075   }
1076 
1077   MachineFrameInfo &MFI = MF.getFrameInfo();
1078   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1079   const SIInstrInfo *TII = ST.getInstrInfo();
1080   const SIRegisterInfo &TRI = TII->getRegisterInfo();
1081   MachineRegisterInfo &MRI = MF.getRegInfo();
1082 
1083   Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1084   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1085   Register BasePtrReg =
1086       TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1087   LiveRegUnits LiveUnits;
1088 
1089   MachineBasicBlock::iterator MBBI = MBB.begin();
1090   // DebugLoc must be unknown since the first instruction with DebugLoc is used
1091   // to determine the end of the prologue.
1092   DebugLoc DL;
1093 
1094   if (FuncInfo->isChainFunction()) {
1095     // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1096     // are free to set one up if they need it.
1097     bool UseSP = requiresStackPointerReference(MF);
1098     if (UseSP) {
1099       assert(StackPtrReg != AMDGPU::SP_REG);
1100 
1101       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
1102           .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
1103     }
1104   }
1105 
1106   bool HasFP = false;
1107   bool HasBP = false;
1108   uint32_t NumBytes = MFI.getStackSize();
1109   uint32_t RoundedSize = NumBytes;
1110 
1111   if (TRI.hasStackRealignment(MF))
1112     HasFP = true;
1113 
1114   Register FramePtrRegScratchCopy;
1115   if (!HasFP && !hasFP(MF)) {
1116     // Emit the CSR spill stores with SP base register.
1117     emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1118                        FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1119                        FramePtrRegScratchCopy);
1120   } else {
1121     // CSR spill stores will use FP as base register.
1122     Register SGPRForFPSaveRestoreCopy =
1123         FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1124 
1125     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1126     if (SGPRForFPSaveRestoreCopy) {
1127       // Copy FP to the scratch register now and emit the CFI entry. It avoids
1128       // the extra FP copy needed in the other two cases when FP is spilled to
1129       // memory or to a VGPR lane.
1130       PrologEpilogSGPRSpillBuilder SB(
1131           FramePtrReg,
1132           FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
1133           DL, TII, TRI, LiveUnits, FramePtrReg);
1134       SB.save();
1135       LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1136     } else {
1137       // Copy FP into a new scratch register so that its previous value can be
1138       // spilled after setting up the new frame.
1139       FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1140           MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1141       if (!FramePtrRegScratchCopy)
1142         report_fatal_error("failed to find free scratch register");
1143 
1144       LiveUnits.addReg(FramePtrRegScratchCopy);
1145       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
1146           .addReg(FramePtrReg);
1147     }
1148   }
1149 
1150   if (HasFP) {
1151     const unsigned Alignment = MFI.getMaxAlign().value();
1152 
1153     RoundedSize += Alignment;
1154     if (LiveUnits.empty()) {
1155       LiveUnits.init(TRI);
1156       LiveUnits.addLiveIns(MBB);
1157     }
1158 
1159     // s_add_i32 s33, s32, NumBytes
1160     // s_and_b32 s33, s33, 0b111...0000
1161     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
1162         .addReg(StackPtrReg)
1163         .addImm((Alignment - 1) * getScratchScaleFactor(ST))
1164         .setMIFlag(MachineInstr::FrameSetup);
1165     auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
1166         .addReg(FramePtrReg, RegState::Kill)
1167         .addImm(-Alignment * getScratchScaleFactor(ST))
1168         .setMIFlag(MachineInstr::FrameSetup);
1169     And->getOperand(3).setIsDead(); // Mark SCC as dead.
1170     FuncInfo->setIsStackRealigned(true);
1171   } else if ((HasFP = hasFP(MF))) {
1172     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1173         .addReg(StackPtrReg)
1174         .setMIFlag(MachineInstr::FrameSetup);
1175   }
1176 
1177   // If FP is used, emit the CSR spills with FP base register.
1178   if (HasFP) {
1179     emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1180                        FramePtrRegScratchCopy);
1181     if (FramePtrRegScratchCopy)
1182       LiveUnits.removeReg(FramePtrRegScratchCopy);
1183   }
1184 
1185   // If we need a base pointer, set it up here. It's whatever the value of
1186   // the stack pointer is at this point. Any variable size objects will be
1187   // allocated after this, so we can still use the base pointer to reference
1188   // the incoming arguments.
1189   if ((HasBP = TRI.hasBasePointer(MF))) {
1190     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1191         .addReg(StackPtrReg)
1192         .setMIFlag(MachineInstr::FrameSetup);
1193   }
1194 
1195   if (HasFP && RoundedSize != 0) {
1196     auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1197         .addReg(StackPtrReg)
1198         .addImm(RoundedSize * getScratchScaleFactor(ST))
1199         .setMIFlag(MachineInstr::FrameSetup);
1200     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1201   }
1202 
1203   bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1204   (void)FPSaved;
1205   assert((!HasFP || FPSaved) &&
1206          "Needed to save FP but didn't save it anywhere");
1207 
1208   // If we allow spilling to AGPRs we may have saved FP but then spill
1209   // everything into AGPRs instead of the stack.
1210   assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
1211          "Saved FP but didn't need it");
1212 
1213   bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
1214   (void)BPSaved;
1215   assert((!HasBP || BPSaved) &&
1216          "Needed to save BP but didn't save it anywhere");
1217 
1218   assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1219 }
1220 
1221 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1222                                    MachineBasicBlock &MBB) const {
1223   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1224   if (FuncInfo->isEntryFunction())
1225     return;
1226 
1227   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1228   const SIInstrInfo *TII = ST.getInstrInfo();
1229   const SIRegisterInfo &TRI = TII->getRegisterInfo();
1230   MachineRegisterInfo &MRI = MF.getRegInfo();
1231   LiveRegUnits LiveUnits;
1232   // Get the insert location for the epilogue. If there were no terminators in
1233   // the block, get the last instruction.
1234   MachineBasicBlock::iterator MBBI = MBB.end();
1235   DebugLoc DL;
1236   if (!MBB.empty()) {
1237     MBBI = MBB.getLastNonDebugInstr();
1238     if (MBBI != MBB.end())
1239       DL = MBBI->getDebugLoc();
1240 
1241     MBBI = MBB.getFirstTerminator();
1242   }
1243 
1244   const MachineFrameInfo &MFI = MF.getFrameInfo();
1245   uint32_t NumBytes = MFI.getStackSize();
1246   uint32_t RoundedSize = FuncInfo->isStackRealigned()
1247                              ? NumBytes + MFI.getMaxAlign().value()
1248                              : NumBytes;
1249   const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1250   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1251   bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1252 
1253   Register FramePtrRegScratchCopy;
1254   Register SGPRForFPSaveRestoreCopy =
1255       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1256   if (FPSaved) {
1257     // CSR spill restores should use FP as base register. If
1258     // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
1259     // into a new scratch register and copy to FP later when other registers are
1260     // restored from the current stack frame.
1261     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1262     if (SGPRForFPSaveRestoreCopy) {
1263       LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1264     } else {
1265       FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1266           MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1267       if (!FramePtrRegScratchCopy)
1268         report_fatal_error("failed to find free scratch register");
1269 
1270       LiveUnits.addReg(FramePtrRegScratchCopy);
1271     }
1272 
1273     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1274                          FramePtrRegScratchCopy);
1275   }
1276 
1277   if (RoundedSize != 0 && hasFP(MF)) {
1278     auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1279         .addReg(StackPtrReg)
1280         .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1281         .setMIFlag(MachineInstr::FrameDestroy);
1282     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1283   }
1284 
1285   if (FPSaved) {
1286     // Insert the copy to restore FP.
1287     Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1288                                                : FramePtrRegScratchCopy;
1289     MachineInstrBuilder MIB =
1290         BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1291             .addReg(SrcReg);
1292     if (SGPRForFPSaveRestoreCopy)
1293       MIB.setMIFlag(MachineInstr::FrameDestroy);
1294   } else {
1295     // Insert the CSR spill restores with SP as the base register.
1296     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
1297                          FramePtrRegScratchCopy);
1298   }
1299 }
1300 
1301 #ifndef NDEBUG
1302 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1303   const MachineFrameInfo &MFI = MF.getFrameInfo();
1304   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1305   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1306        I != E; ++I) {
1307     if (!MFI.isDeadObjectIndex(I) &&
1308         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1309         !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1310       return false;
1311     }
1312   }
1313 
1314   return true;
1315 }
1316 #endif
1317 
1318 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1319                                                     int FI,
1320                                                     Register &FrameReg) const {
1321   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1322 
1323   FrameReg = RI->getFrameRegister(MF);
1324   return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1325 }
1326 
1327 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1328   MachineFunction &MF,
1329   RegScavenger *RS) const {
1330   MachineFrameInfo &MFI = MF.getFrameInfo();
1331 
1332   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1333   const SIInstrInfo *TII = ST.getInstrInfo();
1334   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1335   MachineRegisterInfo &MRI = MF.getRegInfo();
1336   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1337 
1338   // Allocate spill slots for WWM reserved VGPRs.
1339   // For chain functions, we only need to do this if we have calls to
1340   // llvm.amdgcn.cs.chain.
1341   bool IsChainWithoutCalls =
1342       FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1343   if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1344     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
1345       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1346       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
1347                                  TRI->getSpillAlign(*RC));
1348     }
1349   }
1350 
1351   const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1352                                && EnableSpillVGPRToAGPR;
1353 
1354   if (SpillVGPRToAGPR) {
1355     // To track the spill frame indices handled in this pass.
1356     BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1357     BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1358 
1359     bool SeenDbgInstr = false;
1360 
1361     for (MachineBasicBlock &MBB : MF) {
1362       for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
1363         int FrameIndex;
1364         if (MI.isDebugInstr())
1365           SeenDbgInstr = true;
1366 
1367         if (TII->isVGPRSpill(MI)) {
1368           // Try to eliminate stack used by VGPR spills before frame
1369           // finalization.
1370           unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1371                                                      AMDGPU::OpName::vaddr);
1372           int FI = MI.getOperand(FIOp).getIndex();
1373           Register VReg =
1374             TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1375           if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1376                                                 TRI->isAGPR(MRI, VReg))) {
1377             assert(RS != nullptr);
1378             RS->enterBasicBlockEnd(MBB);
1379             RS->backward(std::next(MI.getIterator()));
1380             TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
1381             SpillFIs.set(FI);
1382             continue;
1383           }
1384         } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1385                    TII->isLoadFromStackSlot(MI, FrameIndex))
1386           if (!MFI.isFixedObjectIndex(FrameIndex))
1387             NonVGPRSpillFIs.set(FrameIndex);
1388       }
1389     }
1390 
1391     // Stack slot coloring may assign different objects to the same stack slot.
1392     // If not, then the VGPR to AGPR spill slot is dead.
1393     for (unsigned FI : SpillFIs.set_bits())
1394       if (!NonVGPRSpillFIs.test(FI))
1395         FuncInfo->setVGPRToAGPRSpillDead(FI);
1396 
1397     for (MachineBasicBlock &MBB : MF) {
1398       for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1399         MBB.addLiveIn(Reg);
1400 
1401       for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1402         MBB.addLiveIn(Reg);
1403 
1404       MBB.sortUniqueLiveIns();
1405 
1406       if (!SpillFIs.empty() && SeenDbgInstr) {
1407         // FIXME: The dead frame indices are replaced with a null register from
1408         // the debug value instructions. We should instead, update it with the
1409         // correct register value. But not sure the register value alone is
1410         for (MachineInstr &MI : MBB) {
1411           if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
1412               !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
1413               SpillFIs[MI.getOperand(0).getIndex()]) {
1414             MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
1415           }
1416         }
1417       }
1418     }
1419   }
1420 
1421   // At this point we've already allocated all spilled SGPRs to VGPRs if we
1422   // can. Any remaining SGPR spills will go to memory, so move them back to the
1423   // default stack.
1424   bool HaveSGPRToVMemSpill =
1425       FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1426   assert(allSGPRSpillsAreDead(MF) &&
1427          "SGPR spill should have been removed in SILowerSGPRSpills");
1428 
1429   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1430   // but currently hasNonSpillStackObjects is set only from source
1431   // allocas. Stack temps produced from legalization are not counted currently.
1432   if (!allStackObjectsAreDead(MFI)) {
1433     assert(RS && "RegScavenger required if spilling");
1434 
1435     // Add an emergency spill slot
1436     RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1437 
1438     // If we are spilling SGPRs to memory with a large frame, we may need a
1439     // second VGPR emergency frame index.
1440     if (HaveSGPRToVMemSpill &&
1441         allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1442       RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
1443     }
1444   }
1445 }
1446 
1447 void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1448     MachineFunction &MF, RegScavenger *RS) const {
1449   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1450   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1451   MachineRegisterInfo &MRI = MF.getRegInfo();
1452   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1453 
1454   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1455     // On gfx908, we had initially reserved highest available VGPR for AGPR
1456     // copy. Now since we are done with RA, check if there exist an unused VGPR
1457     // which is lower than the eariler reserved VGPR before RA. If one exist,
1458     // use it for AGPR copy instead of one reserved before RA.
1459     Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1460     Register UnusedLowVGPR =
1461         TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1462     if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1463                           TRI->getHWRegIndex(VGPRForAGPRCopy))) {
1464       // Reserve this newly identified VGPR (for AGPR copy)
1465       // reserved registers should already be frozen at this point
1466       // so we can avoid calling MRI.freezeReservedRegs and just use
1467       // MRI.reserveReg
1468       FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1469       MRI.reserveReg(UnusedLowVGPR, TRI);
1470     }
1471   }
1472   // We initally reserved the highest available SGPR pair for long branches
1473   // now, after RA, we shift down to a lower unused one if one exists
1474   Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1475   Register UnusedLowSGPR =
1476       TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
1477   // If LongBranchReservedReg is null then we didn't find a long branch
1478   // and never reserved a register to begin with so there is nothing to
1479   // shift down. Then if UnusedLowSGPR is null, there isn't available lower
1480   // register to use so just keep the original one we set.
1481   if (LongBranchReservedReg && UnusedLowSGPR) {
1482     FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1483     MRI.reserveReg(UnusedLowSGPR, TRI);
1484   }
1485 }
1486 
1487 // The special SGPR spills like the one needed for FP, BP or any reserved
1488 // registers delayed until frame lowering.
1489 void SIFrameLowering::determinePrologEpilogSGPRSaves(
1490     MachineFunction &MF, BitVector &SavedVGPRs,
1491     bool NeedExecCopyReservedReg) const {
1492   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1493   MachineRegisterInfo &MRI = MF.getRegInfo();
1494   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1495   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1496   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1497   LiveRegUnits LiveUnits;
1498   LiveUnits.init(*TRI);
1499   // Initially mark callee saved registers as used so we will not choose them
1500   // while looking for scratch SGPRs.
1501   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1502   for (unsigned I = 0; CSRegs[I]; ++I)
1503     LiveUnits.addReg(CSRegs[I]);
1504 
1505   const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1506 
1507   if (NeedExecCopyReservedReg) {
1508     Register ReservedReg = MFI->getSGPRForEXECCopy();
1509     assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
1510     Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1511     if (UnusedScratchReg) {
1512       // If found any unused scratch SGPR, reserve the register itself for Exec
1513       // copy and there is no need for any spills in that case.
1514       MFI->setSGPRForEXECCopy(UnusedScratchReg);
1515       LiveUnits.addReg(UnusedScratchReg);
1516     } else {
1517       // Needs spill.
1518       assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
1519              "Re-reserving spill slot for EXEC copy register");
1520       getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
1521                                      /*IncludeScratchCopy=*/false);
1522     }
1523   }
1524 
1525   // hasFP only knows about stack objects that already exist. We're now
1526   // determining the stack slots that will be created, so we have to predict
1527   // them. Stack objects force FP usage with calls.
1528   //
1529   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1530   // don't want to report it here.
1531   //
1532   // FIXME: Is this really hasReservedCallFrame?
1533   const bool WillHaveFP =
1534       FrameInfo.hasCalls() &&
1535       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1536 
1537   if (WillHaveFP || hasFP(MF)) {
1538     Register FramePtrReg = MFI->getFrameOffsetReg();
1539     assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1540            "Re-reserving spill slot for FP");
1541     getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
1542   }
1543 
1544   if (TRI->hasBasePointer(MF)) {
1545     Register BasePtrReg = TRI->getBaseRegister();
1546     assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1547            "Re-reserving spill slot for BP");
1548     getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
1549   }
1550 }
1551 
1552 // Only report VGPRs to generic code.
1553 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1554                                            BitVector &SavedVGPRs,
1555                                            RegScavenger *RS) const {
1556   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1557 
1558   // If this is a function with the amdgpu_cs_chain[_preserve] calling
1559   // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1560   // we don't need to save and restore anything.
1561   if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1562     return;
1563 
1564   MFI->shiftSpillPhysVGPRsToLowestRange(MF);
1565 
1566   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1567   if (MFI->isEntryFunction())
1568     return;
1569 
1570   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1571   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1572   const SIInstrInfo *TII = ST.getInstrInfo();
1573   bool NeedExecCopyReservedReg = false;
1574 
1575   MachineInstr *ReturnMI = nullptr;
1576   for (MachineBasicBlock &MBB : MF) {
1577     for (MachineInstr &MI : MBB) {
1578       // WRITELANE instructions used for SGPR spills can overwrite the inactive
1579       // lanes of VGPRs and callee must spill and restore them even if they are
1580       // marked Caller-saved.
1581 
1582       // TODO: Handle this elsewhere at an early point. Walking through all MBBs
1583       // here would be a bad heuristic. A better way should be by calling
1584       // allocateWWMSpill during the regalloc pipeline whenever a physical
1585       // register is allocated for the intended virtual registers.
1586       if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
1587         MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
1588       else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
1589         MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
1590       else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
1591         NeedExecCopyReservedReg = true;
1592       else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1593                MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1594                (MFI->isChainFunction() &&
1595                 TII->isChainCallOpcode(MI.getOpcode()))) {
1596         // We expect all return to be the same size.
1597         assert(!ReturnMI ||
1598                (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1599                 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1600         ReturnMI = &MI;
1601       }
1602     }
1603   }
1604 
1605   // Remove any VGPRs used in the return value because these do not need to be saved.
1606   // This prevents CSR restore from clobbering return VGPRs.
1607   if (ReturnMI) {
1608     for (auto &Op : ReturnMI->operands()) {
1609       if (Op.isReg())
1610         SavedVGPRs.reset(Op.getReg());
1611     }
1612   }
1613 
1614   // Ignore the SGPRs the default implementation found.
1615   SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1616 
1617   // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1618   // In gfx908 there was do AGPR loads and stores and thus spilling also
1619   // require a temporary VGPR.
1620   if (!ST.hasGFX90AInsts())
1621     SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1622 
1623   determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1624 
1625   // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1626   // allow the default insertion to handle them.
1627   for (auto &Reg : MFI->getWWMSpills())
1628     SavedVGPRs.reset(Reg.first);
1629 
1630   // Mark all lane VGPRs as BB LiveIns.
1631   for (MachineBasicBlock &MBB : MF) {
1632     for (auto &Reg : MFI->getWWMSpills())
1633       MBB.addLiveIn(Reg.first);
1634 
1635     MBB.sortUniqueLiveIns();
1636   }
1637 }
1638 
1639 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1640                                                BitVector &SavedRegs,
1641                                                RegScavenger *RS) const {
1642   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1643   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1644   if (MFI->isEntryFunction())
1645     return;
1646 
1647   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1648   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1649 
1650   // The SP is specifically managed and we don't want extra spills of it.
1651   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1652 
1653   const BitVector AllSavedRegs = SavedRegs;
1654   SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1655 
1656   // We have to anticipate introducing CSR VGPR spills or spill of caller
1657   // save VGPR reserved for SGPR spills as we now always create stack entry
1658   // for it, if we don't have any stack objects already, since we require a FP
1659   // if there is a call and stack. We will allocate a VGPR for SGPR spills if
1660   // there are any SGPR spills. Whether they are CSR spills or otherwise.
1661   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1662   const bool WillHaveFP =
1663       FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1664 
1665   // FP will be specially managed like SP.
1666   if (WillHaveFP || hasFP(MF))
1667     SavedRegs.reset(MFI->getFrameOffsetReg());
1668 
1669   // Return address use with return instruction is hidden through the SI_RETURN
1670   // pseudo. Given that and since the IPRA computes actual register usage and
1671   // does not use CSR list, the clobbering of return address by function calls
1672   // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
1673   // usage collection. This will ensure save/restore of return address happens
1674   // in those scenarios.
1675   const MachineRegisterInfo &MRI = MF.getRegInfo();
1676   Register RetAddrReg = TRI->getReturnAddressReg(MF);
1677   if (!MFI->isEntryFunction() &&
1678       (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1679     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1680     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1681   }
1682 }
1683 
1684 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1685     MachineFunction &MF, const TargetRegisterInfo *TRI,
1686     std::vector<CalleeSavedInfo> &CSI) const {
1687   if (CSI.empty())
1688     return true; // Early exit if no callee saved registers are modified!
1689 
1690   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1691   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1692   const SIRegisterInfo *RI = ST.getRegisterInfo();
1693   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1694   Register BasePtrReg = RI->getBaseRegister();
1695   Register SGPRForFPSaveRestoreCopy =
1696       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1697   Register SGPRForBPSaveRestoreCopy =
1698       FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1699   if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1700     return false;
1701 
1702   unsigned NumModifiedRegs = 0;
1703 
1704   if (SGPRForFPSaveRestoreCopy)
1705     NumModifiedRegs++;
1706   if (SGPRForBPSaveRestoreCopy)
1707     NumModifiedRegs++;
1708 
1709   for (auto &CS : CSI) {
1710     if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1711       CS.setDstReg(SGPRForFPSaveRestoreCopy);
1712       if (--NumModifiedRegs)
1713         break;
1714     } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1715       CS.setDstReg(SGPRForBPSaveRestoreCopy);
1716       if (--NumModifiedRegs)
1717         break;
1718     }
1719   }
1720 
1721   return false;
1722 }
1723 
1724 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1725   const MachineFunction &MF) const {
1726 
1727   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1728   const MachineFrameInfo &MFI = MF.getFrameInfo();
1729   const SIInstrInfo *TII = ST.getInstrInfo();
1730   uint64_t EstStackSize = MFI.estimateStackSize(MF);
1731   uint64_t MaxOffset = EstStackSize - 1;
1732 
1733   // We need the emergency stack slots to be allocated in range of the
1734   // MUBUF/flat scratch immediate offset from the base register, so assign these
1735   // first at the incoming SP position.
1736   //
1737   // TODO: We could try sorting the objects to find a hole in the first bytes
1738   // rather than allocating as close to possible. This could save a lot of space
1739   // on frames with alignment requirements.
1740   if (ST.enableFlatScratch()) {
1741     if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1742                                SIInstrFlags::FlatScratch))
1743       return false;
1744   } else {
1745     if (TII->isLegalMUBUFImmOffset(MaxOffset))
1746       return false;
1747   }
1748 
1749   return true;
1750 }
1751 
1752 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1753   MachineFunction &MF,
1754   MachineBasicBlock &MBB,
1755   MachineBasicBlock::iterator I) const {
1756   int64_t Amount = I->getOperand(0).getImm();
1757   if (Amount == 0)
1758     return MBB.erase(I);
1759 
1760   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1761   const SIInstrInfo *TII = ST.getInstrInfo();
1762   const DebugLoc &DL = I->getDebugLoc();
1763   unsigned Opc = I->getOpcode();
1764   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1765   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1766 
1767   if (!hasReservedCallFrame(MF)) {
1768     Amount = alignTo(Amount, getStackAlign());
1769     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1770     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1771     Register SPReg = MFI->getStackPtrOffsetReg();
1772 
1773     Amount *= getScratchScaleFactor(ST);
1774     if (IsDestroy)
1775       Amount = -Amount;
1776     auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1777         .addReg(SPReg)
1778         .addImm(Amount);
1779     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1780   } else if (CalleePopAmount != 0) {
1781     llvm_unreachable("is this used?");
1782   }
1783 
1784   return MBB.erase(I);
1785 }
1786 
1787 /// Returns true if the frame will require a reference to the stack pointer.
1788 ///
1789 /// This is the set of conditions common to setting up the stack pointer in a
1790 /// kernel, and for using a frame pointer in a callable function.
1791 ///
1792 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1793 /// references SP.
1794 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1795   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1796 }
1797 
1798 // The FP for kernels is always known 0, so we never really need to setup an
1799 // explicit register for it. However, DisableFramePointerElim will force us to
1800 // use a register for it.
1801 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1802   const MachineFrameInfo &MFI = MF.getFrameInfo();
1803 
1804   // For entry & chain functions we can use an immediate offset in most cases,
1805   // so the presence of calls doesn't imply we need a distinct frame pointer.
1806   if (MFI.hasCalls() &&
1807       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1808       !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1809     // All offsets are unsigned, so need to be addressed in the same direction
1810     // as stack growth.
1811 
1812     // FIXME: This function is pretty broken, since it can be called before the
1813     // frame layout is determined or CSR spills are inserted.
1814     return MFI.getStackSize() != 0;
1815   }
1816 
1817   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1818          MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1819              MF) ||
1820          MF.getTarget().Options.DisableFramePointerElim(MF);
1821 }
1822 
1823 // This is essentially a reduced version of hasFP for entry functions. Since the
1824 // stack pointer is known 0 on entry to kernels, we never really need an FP
1825 // register. We may need to initialize the stack pointer depending on the frame
1826 // properties, which logically overlaps many of the cases where an ordinary
1827 // function would require an FP.
1828 // Also used for chain functions. While not technically entry functions, chain
1829 // functions may need to set up a stack pointer in some situations.
1830 bool SIFrameLowering::requiresStackPointerReference(
1831     const MachineFunction &MF) const {
1832   // Callable functions always require a stack pointer reference.
1833   assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
1834           MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
1835          "only expected to call this for entry points and chain functions");
1836 
1837   const MachineFrameInfo &MFI = MF.getFrameInfo();
1838 
1839   // Entry points ordinarily don't need to initialize SP. We have to set it up
1840   // for callees if there are any. Also note tail calls are impossible/don't
1841   // make any sense for kernels.
1842   if (MFI.hasCalls())
1843     return true;
1844 
1845   // We still need to initialize the SP if we're doing anything weird that
1846   // references the SP, like variable sized stack objects.
1847   return frameTriviallyRequiresSP(MFI);
1848 }
1849