1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 
9 #include "SIFrameLowering.h"
10 #include "AMDGPU.h"
11 #include "GCNSubtarget.h"
12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13 #include "SIMachineFunctionInfo.h"
14 #include "llvm/CodeGen/LiveRegUnits.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/RegisterScavenging.h"
17 #include "llvm/Target/TargetMachine.h"
18 
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "frame-info"
22 
23 static cl::opt<bool> EnableSpillVGPRToAGPR(
24   "amdgpu-spill-vgpr-to-agpr",
25   cl::desc("Enable spilling VGPRs to AGPRs"),
26   cl::ReallyHidden,
27   cl::init(true));
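// Note: being a boolean cl::opt, this can be toggled from the command line,
// e.g. -amdgpu-spill-vgpr-to-agpr=0 (or =false) to force VGPR spills to go to
// scratch memory instead, which can help when debugging spill placement.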
28 
29 // Find a register matching \p RC from \p LiveUnits which is unused and
30 // available throughout the function. On failure, returns AMDGPU::NoRegister.
31 // TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32 // MCRegisters. This should reduce the number of iterations and avoid redundant
33 // checking.
34 static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35                                      const LiveRegUnits &LiveUnits,
36                                      const TargetRegisterClass &RC) {
37   for (MCRegister Reg : RC) {
38     if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
39         !MRI.isReserved(Reg))
40       return Reg;
41   }
42   return MCRegister();
43 }
44 
45 // Find a scratch register that we can use in the prologue. We avoid using
46 // callee-save registers since they may appear to be free when this is called
47 // from canUseAsPrologue (during shrink wrapping), but then no longer be free
48 // when this is called from emitPrologue.
49 static MCRegister findScratchNonCalleeSaveRegister(
50     MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51     const TargetRegisterClass &RC, bool Unused = false) {
52   // Mark callee saved registers as used so we will not choose them.
53   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54   for (unsigned i = 0; CSRegs[i]; ++i)
55     LiveUnits.addReg(CSRegs[i]);
56 
57   // We are looking for a register that can be used throughout the entire
58   // function, so any use is unacceptable.
59   if (Unused)
60     return findUnusedRegister(MRI, LiveUnits, RC);
61 
62   for (MCRegister Reg : RC) {
63     if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
64       return Reg;
65   }
66 
67   return MCRegister();
68 }
69 
70 /// Query target location for spilling SGPRs
71 /// \p IncludeScratchCopy : Also look for free scratch SGPRs
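/// The save location is chosen in decreasing order of preference (see the
/// numbered cases in the body below):
///   1. copy into an unused scratch SGPR (no memory traffic at all),
///   2. spill into a lane of a VGPR reserved for prolog/epilog SGPR spills,
///   3. spill to a scratch memory slot as a last resort.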
72 static void getVGPRSpillLaneOrTempRegister(
73     MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
74     const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
75     bool IncludeScratchCopy = true) {
76   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
77   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
78 
79   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80   const SIRegisterInfo *TRI = ST.getRegisterInfo();
81   unsigned Size = TRI->getSpillSize(RC);
82   Align Alignment = TRI->getSpillAlign(RC);
83 
84   // We need to save and restore the given SGPR.
85 
86   Register ScratchSGPR;
87   // 1: Try to save the given register into an unused scratch SGPR. The
88   // LiveUnits should already have all the callee-saved registers marked as
89   // used. In certain cases the copy to a scratch SGPR is skipped.
90   if (IncludeScratchCopy)
91     ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
92 
93   if (!ScratchSGPR) {
94     int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
95                                          TargetStackID::SGPRSpill);
96 
97     if (TRI->spillSGPRToVGPR() &&
98         MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
99       // 2: There's no free scratch SGPR to hold the value, so spill it to a
100       // lane of a VGPR reserved for prolog/epilog SGPR spills instead.
101       MFI->addToPrologEpilogSGPRSpills(
102           SGPR, PrologEpilogSGPRSaveRestoreInfo(
103                     SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
104 
105       LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
106                  dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
107                         << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
108                         << '\n';);
109     } else {
110       // Remove the now-dead frame index created above.
111       MF.getFrameInfo().RemoveStackObject(FI);
112       // 3: If all else fails, spill the register to memory.
113       FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
114       MFI->addToPrologEpilogSGPRSpills(
115           SGPR,
116           PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
117       LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
118                         << printReg(SGPR, TRI) << '\n');
119     }
120   } else {
121     MFI->addToPrologEpilogSGPRSpills(
122         SGPR, PrologEpilogSGPRSaveRestoreInfo(
123                   SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
124     LiveUnits.addReg(ScratchSGPR);
125     LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
126                       << printReg(ScratchSGPR, TRI) << '\n');
127   }
128 }
129 
130 // We need to emit stack operations specially here because a frame register
131 // different from the one getFrameRegister would return for the rest of the
132 // function may be used.
133 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
134                              const SIMachineFunctionInfo &FuncInfo,
135                              LiveRegUnits &LiveUnits, MachineFunction &MF,
136                              MachineBasicBlock &MBB,
137                              MachineBasicBlock::iterator I, const DebugLoc &DL,
138                              Register SpillReg, int FI, Register FrameReg,
139                              int64_t DwordOff = 0) {
140   unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
141                                         : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
142 
143   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
144   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
145   MachineMemOperand *MMO = MF.getMachineMemOperand(
146       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
147       FrameInfo.getObjectAlign(FI));
148   LiveUnits.addReg(SpillReg);
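  // If the register is not live into the block, the spill is its last use, so
  // mark it killed and drop it from LiveUnits again after the store.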
149   bool IsKill = !MBB.isLiveIn(SpillReg);
150   TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
151                           DwordOff, MMO, nullptr, &LiveUnits);
152   if (IsKill)
153     LiveUnits.removeReg(SpillReg);
154 }
155 
156 static void buildEpilogRestore(const GCNSubtarget &ST,
157                                const SIRegisterInfo &TRI,
158                                const SIMachineFunctionInfo &FuncInfo,
159                                LiveRegUnits &LiveUnits, MachineFunction &MF,
160                                MachineBasicBlock &MBB,
161                                MachineBasicBlock::iterator I,
162                                const DebugLoc &DL, Register SpillReg, int FI,
163                                Register FrameReg, int64_t DwordOff = 0) {
164   unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
165                                         : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
166 
167   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
168   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
169   MachineMemOperand *MMO = MF.getMachineMemOperand(
170       PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
171       FrameInfo.getObjectAlign(FI));
172   TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
173                           DwordOff, MMO, nullptr, &LiveUnits);
174 }
175 
176 static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
177                         const DebugLoc &DL, const SIInstrInfo *TII,
178                         Register TargetReg) {
179   MachineFunction *MF = MBB.getParent();
180   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
181   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
182   const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
183   Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
184   Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
185 
186   if (MFI->getGITPtrHigh() != 0xffffffff) {
187     BuildMI(MBB, I, DL, SMovB32, TargetHi)
188         .addImm(MFI->getGITPtrHigh())
189         .addReg(TargetReg, RegState::ImplicitDefine);
190   } else {
191     const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
192     BuildMI(MBB, I, DL, GetPC64, TargetReg);
193   }
194   Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
195   MF->getRegInfo().addLiveIn(GitPtrLo);
196   MBB.addLiveIn(GitPtrLo);
197   BuildMI(MBB, I, DL, SMovB32, TargetLo)
198     .addReg(GitPtrLo);
199 }
200 
201 static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
202                           const SIMachineFunctionInfo *FuncInfo,
203                           MachineFunction &MF, MachineBasicBlock &MBB,
204                           MachineBasicBlock::iterator MBBI, bool IsProlog) {
205   if (LiveUnits.empty()) {
206     LiveUnits.init(TRI);
207     if (IsProlog) {
208       LiveUnits.addLiveIns(MBB);
209     } else {
210       // In epilog.
211       LiveUnits.addLiveOuts(MBB);
212       LiveUnits.stepBackward(*MBBI);
213     }
214   }
215 }
216 
217 namespace llvm {
218 
219 // SpillBuilder to save/restore special SGPR spills like the ones needed for FP,
220 // BP, etc. These spills are delayed until the current function's frame is
221 // finalized. For a given register, the builder uses the
222 // PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
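// Each SGPRSaveKind maps to a pair of methods below:
//   SPILL_TO_MEM          -> saveToMemory()      / restoreFromMemory()
//   SPILL_TO_VGPR_LANE    -> saveToVGPRLane()    / restoreFromVGPRLane()
//   COPY_TO_SCRATCH_SGPR  -> copyToScratchSGPR() / copyFromScratchSGPR()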
223 class PrologEpilogSGPRSpillBuilder {
224   MachineBasicBlock::iterator MI;
225   MachineBasicBlock &MBB;
226   MachineFunction &MF;
227   const GCNSubtarget &ST;
228   MachineFrameInfo &MFI;
229   SIMachineFunctionInfo *FuncInfo;
230   const SIInstrInfo *TII;
231   const SIRegisterInfo &TRI;
232   Register SuperReg;
233   const PrologEpilogSGPRSaveRestoreInfo SI;
234   LiveRegUnits &LiveUnits;
235   const DebugLoc &DL;
236   Register FrameReg;
237   ArrayRef<int16_t> SplitParts;
238   unsigned NumSubRegs;
239   unsigned EltSize = 4;
240 
241   void saveToMemory(const int FI) const {
242     MachineRegisterInfo &MRI = MF.getRegInfo();
243     assert(!MFI.isDeadObjectIndex(FI));
244 
245     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
246 
247     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
248         MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
249     if (!TmpVGPR)
250       report_fatal_error("failed to find free scratch register");
251 
252     for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
253       Register SubReg = NumSubRegs == 1
254                             ? SuperReg
255                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
256       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
257           .addReg(SubReg);
258 
259       buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
260                        FI, FrameReg, DwordOff);
261       DwordOff += 4;
262     }
263   }
264 
265   void saveToVGPRLane(const int FI) const {
266     assert(!MFI.isDeadObjectIndex(FI));
267 
268     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
269     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
270         FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
271     assert(Spill.size() == NumSubRegs);
272 
273     for (unsigned I = 0; I < NumSubRegs; ++I) {
274       Register SubReg = NumSubRegs == 1
275                             ? SuperReg
276                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
277       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
278               Spill[I].VGPR)
279           .addReg(SubReg)
280           .addImm(Spill[I].Lane)
281           .addReg(Spill[I].VGPR, RegState::Undef);
282     }
283   }
284 
285   void copyToScratchSGPR(Register DstReg) const {
286     BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
287         .addReg(SuperReg)
288         .setMIFlag(MachineInstr::FrameSetup);
289   }
290 
291   void restoreFromMemory(const int FI) {
292     MachineRegisterInfo &MRI = MF.getRegInfo();
293 
294     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
295     MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
296         MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
297     if (!TmpVGPR)
298       report_fatal_error("failed to find free scratch register");
299 
300     for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
301       Register SubReg = NumSubRegs == 1
302                             ? SuperReg
303                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
304 
305       buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
306                          TmpVGPR, FI, FrameReg, DwordOff);
307       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
308           .addReg(TmpVGPR, RegState::Kill);
309       DwordOff += 4;
310     }
311   }
312 
313   void restoreFromVGPRLane(const int FI) {
314     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
315     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
316         FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
317     assert(Spill.size() == NumSubRegs);
318 
319     for (unsigned I = 0; I < NumSubRegs; ++I) {
320       Register SubReg = NumSubRegs == 1
321                             ? SuperReg
322                             : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
323       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
324           .addReg(Spill[I].VGPR)
325           .addImm(Spill[I].Lane);
326     }
327   }
328 
329   void copyFromScratchSGPR(Register SrcReg) const {
330     BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
331         .addReg(SrcReg)
332         .setMIFlag(MachineInstr::FrameDestroy);
333   }
334 
335 public:
336   PrologEpilogSGPRSpillBuilder(Register Reg,
337                                const PrologEpilogSGPRSaveRestoreInfo SI,
338                                MachineBasicBlock &MBB,
339                                MachineBasicBlock::iterator MI,
340                                const DebugLoc &DL, const SIInstrInfo *TII,
341                                const SIRegisterInfo &TRI,
342                                LiveRegUnits &LiveUnits, Register FrameReg)
343       : MI(MI), MBB(MBB), MF(*MBB.getParent()),
344         ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
345         FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
346         SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
347         FrameReg(FrameReg) {
348     const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
349     SplitParts = TRI.getRegSplitParts(RC, EltSize);
350     NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
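    // With EltSize == 4 the super-register is handled one 32-bit sub-register
    // (one dword) at a time; e.g. a 64-bit SGPR pair yields two sub-registers,
    // which is why the save/restore loops step DwordOff by 4.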
351 
352     assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
353   }
354 
355   void save() {
356     switch (SI.getKind()) {
357     case SGPRSaveKind::SPILL_TO_MEM:
358       return saveToMemory(SI.getIndex());
359     case SGPRSaveKind::SPILL_TO_VGPR_LANE:
360       return saveToVGPRLane(SI.getIndex());
361     case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
362       return copyToScratchSGPR(SI.getReg());
363     }
364   }
365 
366   void restore() {
367     switch (SI.getKind()) {
368     case SGPRSaveKind::SPILL_TO_MEM:
369       return restoreFromMemory(SI.getIndex());
370     case SGPRSaveKind::SPILL_TO_VGPR_LANE:
371       return restoreFromVGPRLane(SI.getIndex());
372     case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
373       return copyFromScratchSGPR(SI.getReg());
374     }
375   }
376 };
377 
378 } // namespace llvm
379 
380 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
381 void SIFrameLowering::emitEntryFunctionFlatScratchInit(
382     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
383     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
384   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
385   const SIInstrInfo *TII = ST.getInstrInfo();
386   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
387   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
388 
389   // We don't need this if we only have spills, since there is no user-facing
390   // scratch.
391 
392   // TODO: If we know we don't have flat instructions earlier, we can omit
393   // this from the input registers.
394   //
395   // TODO: We only need to know if we access scratch space through a flat
396   // pointer. Because we only detect if flat instructions are used at all,
397   // this will be used more often than necessary on VI.
398 
399   Register FlatScrInitLo;
400   Register FlatScrInitHi;
401 
402   if (ST.isAmdPalOS()) {
403     // Extract the scratch offset from the descriptor in the GIT
404     LiveRegUnits LiveUnits;
405     LiveUnits.init(*TRI);
406     LiveUnits.addLiveIns(MBB);
407 
408     // Find unused reg to load flat scratch init into
409     MachineRegisterInfo &MRI = MF.getRegInfo();
410     Register FlatScrInit = AMDGPU::NoRegister;
411     ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
412     unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
413     AllSGPR64s = AllSGPR64s.slice(
414         std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
415     Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
416     for (MCPhysReg Reg : AllSGPR64s) {
417       if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
418           MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
419         FlatScrInit = Reg;
420         break;
421       }
422     }
423     assert(FlatScrInit && "Failed to find free register for scratch init");
424 
425     FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
426     FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
427 
428     buildGitPtr(MBB, I, DL, TII, FlatScrInit);
429 
430     // We now have the GIT ptr - now get the scratch descriptor from the entry
431     // at offset 0 (or offset 16 for a compute shader).
432     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
433     const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
434     auto *MMO = MF.getMachineMemOperand(
435         PtrInfo,
436         MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
437             MachineMemOperand::MODereferenceable,
438         8, Align(4));
439     unsigned Offset =
440         MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
441     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
442     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
443     BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
444         .addReg(FlatScrInit)
445         .addImm(EncodedOffset) // offset
446         .addImm(0)             // cpol
447         .addMemOperand(MMO);
448 
449     // Keep only bits [47:0] of the descriptor base by clearing the upper 16 bits of the high dword.
450     const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
451     auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
452         .addReg(FlatScrInitHi)
453         .addImm(0xffff);
454     And->getOperand(3).setIsDead(); // Mark SCC as dead.
455   } else {
456     Register FlatScratchInitReg =
457         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
458     assert(FlatScratchInitReg);
459 
460     MachineRegisterInfo &MRI = MF.getRegInfo();
461     MRI.addLiveIn(FlatScratchInitReg);
462     MBB.addLiveIn(FlatScratchInitReg);
463 
464     FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
465     FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
466   }
467 
468   // Do a 64-bit pointer add.
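  // A rough sketch of what gets emitted below (GFX10+ writes flat_scratch via
  // s_setreg_b32 because it is no longer a directly writable register pair):
  //   s_add_u32  lo, lo, scratch_wave_offset
  //   s_addc_u32 hi, hi, 0
  //   s_setreg_b32 hwreg(FLAT_SCR_LO), lo   ; GFX10+ only
  //   s_setreg_b32 hwreg(FLAT_SCR_HI), hi   ; GFX10+ only
  // while GFX9 adds directly into FLAT_SCR_LO/FLAT_SCR_HI.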
469   if (ST.flatScratchIsPointer()) {
470     if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
471       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
472         .addReg(FlatScrInitLo)
473         .addReg(ScratchWaveOffsetReg);
474       auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
475                           FlatScrInitHi)
476         .addReg(FlatScrInitHi)
477         .addImm(0);
478       Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
479 
480       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
481         addReg(FlatScrInitLo).
482         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
483                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
484       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
485         addReg(FlatScrInitHi).
486         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
487                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
488       return;
489     }
490 
491     // For GFX9.
492     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
493       .addReg(FlatScrInitLo)
494       .addReg(ScratchWaveOffsetReg);
495     auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
496                         AMDGPU::FLAT_SCR_HI)
497       .addReg(FlatScrInitHi)
498       .addImm(0);
499     Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
500 
501     return;
502   }
503 
504   assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
505 
506   // Copy the size in bytes.
507   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
508     .addReg(FlatScrInitHi, RegState::Kill);
509 
510   // Add wave offset in bytes to private base offset.
511   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
512   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
513       .addReg(FlatScrInitLo)
514       .addReg(ScratchWaveOffsetReg);
515 
516   // Convert offset to 256-byte units.
517   auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
518                       AMDGPU::FLAT_SCR_HI)
519     .addReg(FlatScrInitLo, RegState::Kill)
520     .addImm(8);
521   LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
522 }
523 
524 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
525 // memory. They should have been removed by now.
526 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
527   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
528        I != E; ++I) {
529     if (!MFI.isDeadObjectIndex(I))
530       return false;
531   }
532 
533   return true;
534 }
535 
536 // Shift down registers reserved for the scratch RSRC.
537 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
538     MachineFunction &MF) const {
539 
540   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
541   const SIInstrInfo *TII = ST.getInstrInfo();
542   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
543   MachineRegisterInfo &MRI = MF.getRegInfo();
544   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
545 
546   assert(MFI->isEntryFunction());
547 
548   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
549 
550   if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
551                           allStackObjectsAreDead(MF.getFrameInfo())))
552     return Register();
553 
554   if (ST.hasSGPRInitBug() ||
555       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
556     return ScratchRsrcReg;
557 
558   // We reserved the last SGPRs for this. Shift the chosen register down to sit
559   // just past the SGPRs that were actually used as preloaded inputs.
560   //
561   // FIXME: It might be safer to use a pseudoregister before replacement.
562 
563   // FIXME: We should be able to eliminate unused input registers. We only
564   // cannot do this for the resources required for scratch access. For now we
565   // skip over user SGPRs and may leave unused holes.
566 
567   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
568   ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
569   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
570 
571   // Skip the last N reserved elements because they should have already been
572   // reserved for VCC etc.
573   Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
574   for (MCPhysReg Reg : AllSGPR128s) {
575     // Pick the first unallocated one. Make sure we don't clobber the other
576     // reserved input we needed. Also for PAL, make sure we don't clobber
577     // the GIT pointer passed in SGPR0 or SGPR8.
578     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
579         (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
580       MRI.replaceRegWith(ScratchRsrcReg, Reg);
581       MFI->setScratchRSrcReg(Reg);
582       return Reg;
583     }
584   }
585 
586   return ScratchRsrcReg;
587 }
588 
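// Rough intuition: when scratch is accessed through the buffer (MUBUF) path,
// SP/FP hold swizzled per-wave byte offsets, so a per-lane frame size must be
// scaled by the wavefront size. With flat scratch enabled the offsets are
// already per-lane and no scaling is needed.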
589 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
590   return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
591 }
592 
593 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
594                                                 MachineBasicBlock &MBB) const {
595   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
596 
597   // FIXME: If we only have SGPR spills, we won't actually be using scratch
598   // memory since these spill to VGPRs. We should be cleaning up these unused
599   // SGPR spill frame indices somewhere.
600 
601   // FIXME: We still have implicit uses on SGPR spill instructions in case they
602   // need to spill to vector memory. It's likely that will not happen, but at
603   // this point it appears we need the setup. This part of the prolog should be
604   // emitted after frame indices are eliminated.
605 
606   // FIXME: Remove all of the isPhysRegUsed checks
607 
608   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
609   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
610   const SIInstrInfo *TII = ST.getInstrInfo();
611   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
612   MachineRegisterInfo &MRI = MF.getRegInfo();
613   const Function &F = MF.getFunction();
614   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
615 
616   assert(MFI->isEntryFunction());
617 
618   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
619       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
620 
621   // We need to do the replacement of the private segment buffer register even
622   // if there are no stack objects. There could be stores to undef or a
623   // constant without an associated object.
624   //
625   // This will return `Register()` in cases where there are no actual
626   // uses of the SRSRC.
627   Register ScratchRsrcReg;
628   if (!ST.enableFlatScratch())
629     ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
630 
631   // Make the selected register live throughout the function.
632   if (ScratchRsrcReg) {
633     for (MachineBasicBlock &OtherBB : MF) {
634       if (&OtherBB != &MBB) {
635         OtherBB.addLiveIn(ScratchRsrcReg);
636       }
637     }
638   }
639 
640   // Now that we have fixed the reserved SRSRC we need to locate the
641   // (potentially) preloaded SRSRC.
642   Register PreloadedScratchRsrcReg;
643   if (ST.isAmdHsaOrMesa(F)) {
644     PreloadedScratchRsrcReg =
645         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
646     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
647       // We added live-ins during argument lowering, but since they were not
648       // used they were deleted. We're adding the uses now, so add them back.
649       MRI.addLiveIn(PreloadedScratchRsrcReg);
650       MBB.addLiveIn(PreloadedScratchRsrcReg);
651     }
652   }
653 
654   // Debug location must be unknown since the first debug location is used to
655   // determine the end of the prologue.
656   DebugLoc DL;
657   MachineBasicBlock::iterator I = MBB.begin();
658 
659   // We found the SRSRC first because it needs four registers and has an
660   // alignment requirement. If the SRSRC we found overlaps the scratch wave
661   // offset, which may be in a fixed SGPR or a free SGPR chosen by
662   // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
663   // free SGPR.
664   Register ScratchWaveOffsetReg;
665   if (PreloadedScratchWaveOffsetReg &&
666       TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
667     ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
668     unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
669     AllSGPRs = AllSGPRs.slice(
670         std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
671     Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
672     for (MCPhysReg Reg : AllSGPRs) {
673       if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
674           !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
675         ScratchWaveOffsetReg = Reg;
676         BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
677             .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
678         break;
679       }
680     }
681   } else {
682     ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
683   }
684   assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
685 
686   if (requiresStackPointerReference(MF)) {
687     Register SPReg = MFI->getStackPtrOffsetReg();
688     assert(SPReg != AMDGPU::SP_REG);
689     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
690         .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
691   }
692 
693   if (hasFP(MF)) {
694     Register FPReg = MFI->getFrameOffsetReg();
695     assert(FPReg != AMDGPU::FP_REG);
696     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
697   }
698 
699   bool NeedsFlatScratchInit =
700       MFI->getUserSGPRInfo().hasFlatScratchInit() &&
701       (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
702        (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
703 
704   if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
705       PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
706     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
707     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
708   }
709 
710   if (NeedsFlatScratchInit) {
711     emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
712   }
713 
714   if (ScratchRsrcReg) {
715     emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
716                                          PreloadedScratchRsrcReg,
717                                          ScratchRsrcReg, ScratchWaveOffsetReg);
718   }
719 }
720 
721 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister`
722 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
723     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
724     const DebugLoc &DL, Register PreloadedScratchRsrcReg,
725     Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
726 
727   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
728   const SIInstrInfo *TII = ST.getInstrInfo();
729   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
730   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
731   const Function &Fn = MF.getFunction();
732 
733   if (ST.isAmdPalOS()) {
734     // The pointer to the GIT is formed from the offset passed in and either
735     // the amdgpu-git-ptr-high function attribute or the top part of the PC
736     Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
737     Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
738 
739     buildGitPtr(MBB, I, DL, TII, Rsrc01);
740 
741     // We now have the GIT ptr - now get the scratch descriptor from the entry
742     // at offset 0 (or offset 16 for a compute shader).
743     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
744     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
745     auto MMO = MF.getMachineMemOperand(PtrInfo,
746                                        MachineMemOperand::MOLoad |
747                                            MachineMemOperand::MOInvariant |
748                                            MachineMemOperand::MODereferenceable,
749                                        16, Align(4));
750     unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
751     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
752     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
753     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
754       .addReg(Rsrc01)
755       .addImm(EncodedOffset) // offset
756       .addImm(0) // cpol
757       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
758       .addMemOperand(MMO);
759 
760     // The driver will always set the SRD for wave 64 (bits 118:117 of the
761     // descriptor / bits 22:21 of the third sub-reg will be 0b11).
762     // If the shader is actually wave32, we have to modify the const_index_stride
763     // field of the descriptor's 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
764     // driver does this because it can present two shaders with different wave
765     // sizes at once (e.g. VsFs).
766     // TODO: convert to using SCRATCH instructions or multiple SRD buffers
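    // S_BITSET0_B32 with operand 21 clears bit 21 of Rsrc03, turning the
    // two-bit field at [22:21] from 0b11 (stride 64) into 0b10 (stride 32).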
767     if (ST.isWave32()) {
768       const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
769       BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
770           .addImm(21)
771           .addReg(Rsrc03);
772     }
773   } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
774     assert(!ST.isAmdHsaOrMesa(Fn));
775     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
776 
777     Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
778     Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
779 
780     // Use relocations to get the pointer, and setup the other bits manually.
781     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
782 
783     if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
784       Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
785 
786       if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
787         const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
788 
789         BuildMI(MBB, I, DL, Mov64, Rsrc01)
790           .addReg(MFI->getImplicitBufferPtrUserSGPR())
791           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
792       } else {
793         const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
794 
795         MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
796         auto MMO = MF.getMachineMemOperand(
797             PtrInfo,
798             MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
799                 MachineMemOperand::MODereferenceable,
800             8, Align(4));
801         BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
802           .addReg(MFI->getImplicitBufferPtrUserSGPR())
803           .addImm(0) // offset
804           .addImm(0) // cpol
805           .addMemOperand(MMO)
806           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
807 
808         MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
809         MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
810       }
811     } else {
812       Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
813       Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
814 
815       BuildMI(MBB, I, DL, SMovB32, Rsrc0)
816         .addExternalSymbol("SCRATCH_RSRC_DWORD0")
817         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
818 
819       BuildMI(MBB, I, DL, SMovB32, Rsrc1)
820         .addExternalSymbol("SCRATCH_RSRC_DWORD1")
821         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
822     }
823 
824     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
825       .addImm(Rsrc23 & 0xffffffff)
826       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
827 
828     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
829       .addImm(Rsrc23 >> 32)
830       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
831   } else if (ST.isAmdHsaOrMesa(Fn)) {
832     assert(PreloadedScratchRsrcReg);
833 
834     if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
835       BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
836           .addReg(PreloadedScratchRsrcReg, RegState::Kill);
837     }
838   }
839 
840   // Add the scratch wave offset into the scratch RSRC.
841   //
842   // We only want to update the first 48 bits, which is the base address
843   // pointer, without touching the adjacent 16 bits of flags. We know this add
844   // cannot carry out of bit 47; otherwise the scratch allocation would be
845   // impossible to fit in the 48-bit global address space.
846   //
847   // TODO: Evaluate if it is better to just construct an SRD using the flat
848   // scratch init and some constants rather than update the one we are passed.
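  // The sequence emitted below is essentially:
  //   s_add_u32  rsrc.sub0, rsrc.sub0, scratch_wave_offset
  //   s_addc_u32 rsrc.sub1, rsrc.sub1, 0
  // so any carry only propagates into bits [47:32] of the base address and the
  // flag bits above them are left untouched.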
849   Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
850   Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
851 
852   // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
853   // the kernel body via inreg arguments.
854   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
855       .addReg(ScratchRsrcSub0)
856       .addReg(ScratchWaveOffsetReg)
857       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
858   auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
859       .addReg(ScratchRsrcSub1)
860       .addImm(0)
861       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
862   Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
863 }
864 
865 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
866   switch (ID) {
867   case TargetStackID::Default:
868   case TargetStackID::NoAlloc:
869   case TargetStackID::SGPRSpill:
870     return true;
871   case TargetStackID::ScalableVector:
872   case TargetStackID::WasmLocal:
873     return false;
874   }
875   llvm_unreachable("Invalid TargetStackID::Value");
876 }
877 
878 // Activate only the inactive lanes when \p EnableInactiveLanes is true.
879 // Otherwise, activate all lanes. Returns the register holding the saved EXEC.
880 static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
881                                      MachineFunction &MF,
882                                      MachineBasicBlock &MBB,
883                                      MachineBasicBlock::iterator MBBI,
884                                      const DebugLoc &DL, bool IsProlog,
885                                      bool EnableInactiveLanes) {
886   Register ScratchExecCopy;
887   MachineRegisterInfo &MRI = MF.getRegInfo();
888   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
889   const SIInstrInfo *TII = ST.getInstrInfo();
890   const SIRegisterInfo &TRI = TII->getRegisterInfo();
891   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
892 
893   initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
894 
895   ScratchExecCopy = findScratchNonCalleeSaveRegister(
896       MRI, LiveUnits, *TRI.getWaveMaskRegClass());
897   if (!ScratchExecCopy)
898     report_fatal_error("failed to find free scratch register");
899 
900   LiveUnits.addReg(ScratchExecCopy);
901 
902   const unsigned SaveExecOpc =
903       ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
904                                            : AMDGPU::S_OR_SAVEEXEC_B32)
905                     : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
906                                            : AMDGPU::S_OR_SAVEEXEC_B64);
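  // Both forms copy the current EXEC into ScratchExecCopy first. With an
  // all-ones operand, *_OR_SAVEEXEC then enables every lane, while
  // *_XOR_SAVEEXEC inverts EXEC so only the previously inactive lanes run.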
907   auto SaveExec =
908       BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
909   SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
910 
911   return ScratchExecCopy;
912 }
913 
914 void SIFrameLowering::emitCSRSpillStores(
915     MachineFunction &MF, MachineBasicBlock &MBB,
916     MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
917     Register FrameReg, Register FramePtrRegScratchCopy) const {
918   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
919   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
920   const SIInstrInfo *TII = ST.getInstrInfo();
921   const SIRegisterInfo &TRI = TII->getRegisterInfo();
922 
923   // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
924   // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
925   // might end up flipping the EXEC bits twice.
926   Register ScratchExecCopy;
927   SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
928   FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
929   if (!WWMScratchRegs.empty())
930     ScratchExecCopy =
931         buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
932                              /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
933 
934   auto StoreWWMRegisters =
935       [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
936         for (const auto &Reg : WWMRegs) {
937           Register VGPR = Reg.first;
938           int FI = Reg.second;
939           buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
940                            VGPR, FI, FrameReg);
941         }
942       };
943 
944   StoreWWMRegisters(WWMScratchRegs);
945   if (!WWMCalleeSavedRegs.empty()) {
946     if (ScratchExecCopy) {
947       unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
948       BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
949     } else {
950       ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
951                                              /*IsProlog*/ true,
952                                              /*EnableInactiveLanes*/ false);
953     }
954   }
955 
956   StoreWWMRegisters(WWMCalleeSavedRegs);
957   if (ScratchExecCopy) {
958     // FIXME: Split block and make terminator.
959     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
960     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
961         .addReg(ScratchExecCopy, RegState::Kill);
962     LiveUnits.addReg(ScratchExecCopy);
963   }
964 
965   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
966 
967   for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
968     // Special handling for the FP spill:
969     // Skip it if FP is saved to a scratch SGPR; that save has already been
970     // emitted. Otherwise, FP has been copied to a temporary register, so spill
971     // that register instead.
972     Register Reg =
973         Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
974     if (!Reg)
975       continue;
976 
977     PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
978                                     LiveUnits, FrameReg);
979     SB.save();
980   }
981 
982   // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
983   // such scratch registers live throughout the function.
984   SmallVector<Register, 1> ScratchSGPRs;
985   FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
986   if (!ScratchSGPRs.empty()) {
987     for (MachineBasicBlock &MBB : MF) {
988       for (MCPhysReg Reg : ScratchSGPRs)
989         MBB.addLiveIn(Reg);
990 
991       MBB.sortUniqueLiveIns();
992     }
993     if (!LiveUnits.empty()) {
994       for (MCPhysReg Reg : ScratchSGPRs)
995         LiveUnits.addReg(Reg);
996     }
997   }
998 }
999 
1000 void SIFrameLowering::emitCSRSpillRestores(
1001     MachineFunction &MF, MachineBasicBlock &MBB,
1002     MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
1003     Register FrameReg, Register FramePtrRegScratchCopy) const {
1004   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1005   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1006   const SIInstrInfo *TII = ST.getInstrInfo();
1007   const SIRegisterInfo &TRI = TII->getRegisterInfo();
1008   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1009 
1010   for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1011     // Special handling for the FP restore:
1012     // Skip it if FP is to be restored from the scratch SGPR. Otherwise, restore
1013     // the FP value into a temporary register. The frame pointer should be
1014     // overwritten only at the end, once all other spills have been restored from
1015     // the current frame.
1016     Register Reg =
1017         Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1018     if (!Reg)
1019       continue;
1020 
1021     PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1022                                     LiveUnits, FrameReg);
1023     SB.restore();
1024   }
1025 
1026   // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1027   // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1028   // this, we might end up flipping the EXEC bits twice.
1029   Register ScratchExecCopy;
1030   SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1031   FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
1032   if (!WWMScratchRegs.empty())
1033     ScratchExecCopy =
1034         buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1035                              /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
1036 
1037   auto RestoreWWMRegisters =
1038       [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1039         for (const auto &Reg : WWMRegs) {
1040           Register VGPR = Reg.first;
1041           int FI = Reg.second;
1042           buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
1043                              VGPR, FI, FrameReg);
1044         }
1045       };
1046 
1047   RestoreWWMRegisters(WWMScratchRegs);
1048   if (!WWMCalleeSavedRegs.empty()) {
1049     if (ScratchExecCopy) {
1050       unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1051       BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
1052     } else {
1053       ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1054                                              /*IsProlog*/ false,
1055                                              /*EnableInactiveLanes*/ false);
1056     }
1057   }
1058 
1059   RestoreWWMRegisters(WWMCalleeSavedRegs);
1060   if (ScratchExecCopy) {
1061     // FIXME: Split block and make terminator.
1062     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1063     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
1064         .addReg(ScratchExecCopy, RegState::Kill);
1065   }
1066 }
1067 
1068 void SIFrameLowering::emitPrologue(MachineFunction &MF,
1069                                    MachineBasicBlock &MBB) const {
1070   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1071   if (FuncInfo->isEntryFunction()) {
1072     emitEntryFunctionPrologue(MF, MBB);
1073     return;
1074   }
1075 
1076   MachineFrameInfo &MFI = MF.getFrameInfo();
1077   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1078   const SIInstrInfo *TII = ST.getInstrInfo();
1079   const SIRegisterInfo &TRI = TII->getRegisterInfo();
1080   MachineRegisterInfo &MRI = MF.getRegInfo();
1081 
1082   Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1083   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1084   Register BasePtrReg =
1085       TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1086   LiveRegUnits LiveUnits;
1087 
1088   MachineBasicBlock::iterator MBBI = MBB.begin();
1089   // DebugLoc must be unknown since the first instruction with DebugLoc is used
1090   // to determine the end of the prologue.
1091   DebugLoc DL;
1092 
1093   if (FuncInfo->isChainFunction()) {
1094     // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1095     // are free to set one up if they need it.
1096     bool UseSP = requiresStackPointerReference(MF);
1097     if (UseSP) {
1098       assert(StackPtrReg != AMDGPU::SP_REG);
1099 
1100       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
1101           .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
1102     }
1103   }
1104 
1105   bool HasFP = false;
1106   bool HasBP = false;
1107   uint32_t NumBytes = MFI.getStackSize();
1108   uint32_t RoundedSize = NumBytes;
1109 
1110   if (TRI.hasStackRealignment(MF))
1111     HasFP = true;
1112 
1113   Register FramePtrRegScratchCopy;
1114   if (!HasFP && !hasFP(MF)) {
1115     // Emit the CSR spill stores with SP base register.
1116     emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1117                        FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1118                        FramePtrRegScratchCopy);
1119   } else {
1120     // CSR spill stores will use FP as base register.
1121     Register SGPRForFPSaveRestoreCopy =
1122         FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1123 
1124     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1125     if (SGPRForFPSaveRestoreCopy) {
1126       // Copy FP to the scratch register now and emit the CFI entry. It avoids
1127       // the extra FP copy needed in the other two cases when FP is spilled to
1128       // memory or to a VGPR lane.
1129       PrologEpilogSGPRSpillBuilder SB(
1130           FramePtrReg,
1131           FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
1132           DL, TII, TRI, LiveUnits, FramePtrReg);
1133       SB.save();
1134       LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1135     } else {
1136       // Copy FP into a new scratch register so that its previous value can be
1137       // spilled after setting up the new frame.
1138       FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1139           MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1140       if (!FramePtrRegScratchCopy)
1141         report_fatal_error("failed to find free scratch register");
1142 
1143       LiveUnits.addReg(FramePtrRegScratchCopy);
1144       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
1145           .addReg(FramePtrReg);
1146     }
1147   }
1148 
1149   if (HasFP) {
1150     const unsigned Alignment = MFI.getMaxAlign().value();
1151 
1152     RoundedSize += Alignment;
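    // Over-allocate by the maximum alignment so that rounding the FP up below
    // (s_add_i32 + s_and_b32) always stays within the reserved frame.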
1153     if (LiveUnits.empty()) {
1154       LiveUnits.init(TRI);
1155       LiveUnits.addLiveIns(MBB);
1156     }
1157 
1158     // s_add_i32 s33, s32, NumBytes
1159     // s_and_b32 s33, s33, 0b111...0000
1160     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
1161         .addReg(StackPtrReg)
1162         .addImm((Alignment - 1) * getScratchScaleFactor(ST))
1163         .setMIFlag(MachineInstr::FrameSetup);
1164     auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
1165         .addReg(FramePtrReg, RegState::Kill)
1166         .addImm(-Alignment * getScratchScaleFactor(ST))
1167         .setMIFlag(MachineInstr::FrameSetup);
1168     And->getOperand(3).setIsDead(); // Mark SCC as dead.
1169     FuncInfo->setIsStackRealigned(true);
1170   } else if ((HasFP = hasFP(MF))) {
1171     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1172         .addReg(StackPtrReg)
1173         .setMIFlag(MachineInstr::FrameSetup);
1174   }
1175 
1176   // If FP is used, emit the CSR spills with FP base register.
1177   if (HasFP) {
1178     emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1179                        FramePtrRegScratchCopy);
1180     if (FramePtrRegScratchCopy)
1181       LiveUnits.removeReg(FramePtrRegScratchCopy);
1182   }
1183 
1184   // If we need a base pointer, set it up here. It's whatever the value of
1185   // the stack pointer is at this point. Any variable size objects will be
1186   // allocated after this, so we can still use the base pointer to reference
1187   // the incoming arguments.
1188   if ((HasBP = TRI.hasBasePointer(MF))) {
1189     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1190         .addReg(StackPtrReg)
1191         .setMIFlag(MachineInstr::FrameSetup);
1192   }
1193 
1194   if (HasFP && RoundedSize != 0) {
1195     auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1196         .addReg(StackPtrReg)
1197         .addImm(RoundedSize * getScratchScaleFactor(ST))
1198         .setMIFlag(MachineInstr::FrameSetup);
1199     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1200   }
1201 
1202   bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1203   (void)FPSaved;
1204   assert((!HasFP || FPSaved) &&
1205          "Needed to save FP but didn't save it anywhere");
1206 
1207   // If we allow spilling to AGPRs we may have saved FP but then spill
1208   // everything into AGPRs instead of the stack.
1209   assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
1210          "Saved FP but didn't need it");
1211 
1212   bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
1213   (void)BPSaved;
1214   assert((!HasBP || BPSaved) &&
1215          "Needed to save BP but didn't save it anywhere");
1216 
1217   assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1218 }
1219 
1220 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1221                                    MachineBasicBlock &MBB) const {
1222   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1223   if (FuncInfo->isEntryFunction())
1224     return;
1225 
1226   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1227   const SIInstrInfo *TII = ST.getInstrInfo();
1228   const SIRegisterInfo &TRI = TII->getRegisterInfo();
1229   MachineRegisterInfo &MRI = MF.getRegInfo();
1230   LiveRegUnits LiveUnits;
1231   // Get the insert location for the epilogue. If there were no terminators in
1232   // the block, get the last instruction.
1233   MachineBasicBlock::iterator MBBI = MBB.end();
1234   DebugLoc DL;
1235   if (!MBB.empty()) {
1236     MBBI = MBB.getLastNonDebugInstr();
1237     if (MBBI != MBB.end())
1238       DL = MBBI->getDebugLoc();
1239 
1240     MBBI = MBB.getFirstTerminator();
1241   }
1242 
1243   const MachineFrameInfo &MFI = MF.getFrameInfo();
1244   uint32_t NumBytes = MFI.getStackSize();
1245   uint32_t RoundedSize = FuncInfo->isStackRealigned()
1246                              ? NumBytes + MFI.getMaxAlign().value()
1247                              : NumBytes;
1248   const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1249   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1250   bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1251 
1252   Register FramePtrRegScratchCopy;
1253   Register SGPRForFPSaveRestoreCopy =
1254       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1255   if (FPSaved) {
1256     // CSR spill restores should use FP as the base register. If
1257     // SGPRForFPSaveRestoreCopy is not set, restore the previous value of FP
1258     // into a new scratch register and copy it to FP later, once the other
1259     // registers have been restored from the current stack frame.
1260     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1261     if (SGPRForFPSaveRestoreCopy) {
1262       LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1263     } else {
1264       FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1265           MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1266       if (!FramePtrRegScratchCopy)
1267         report_fatal_error("failed to find free scratch register");
1268 
1269       LiveUnits.addReg(FramePtrRegScratchCopy);
1270     }
1271 
1272     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1273                          FramePtrRegScratchCopy);
1274   }
1275 
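       // Undo the prologue's SP increment. The byte size is scaled by the
       // scratch scale factor (the wavefront size for swizzled MUBUF scratch, 1
       // when flat scratch is enabled).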
1276   if (RoundedSize != 0 && hasFP(MF)) {
1277     auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1278         .addReg(StackPtrReg)
1279         .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1280         .setMIFlag(MachineInstr::FrameDestroy);
1281     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1282   }
1283 
1284   if (FPSaved) {
1285     // Insert the copy to restore FP.
1286     Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1287                                                : FramePtrRegScratchCopy;
1288     MachineInstrBuilder MIB =
1289         BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1290             .addReg(SrcReg);
1291     if (SGPRForFPSaveRestoreCopy)
1292       MIB.setMIFlag(MachineInstr::FrameDestroy);
1293   } else {
1294     // Insert the CSR spill restores with SP as the base register.
1295     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
1296                          FramePtrRegScratchCopy);
1297   }
1298 }
1299 
1300 #ifndef NDEBUG
1301 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1302   const MachineFrameInfo &MFI = MF.getFrameInfo();
1303   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1304   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1305        I != E; ++I) {
1306     if (!MFI.isDeadObjectIndex(I) &&
1307         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1308         !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1309       return false;
1310     }
1311   }
1312 
1313   return true;
1314 }
1315 #endif
1316 
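     // Frame index references are resolved against the frame register selected
     // by SIRegisterInfo; the object offset recorded in MachineFrameInfo is
     // returned unchanged as a fixed offset.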
1317 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1318                                                     int FI,
1319                                                     Register &FrameReg) const {
1320   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1321 
1322   FrameReg = RI->getFrameRegister(MF);
1323   return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1324 }
1325 
1326 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1327   MachineFunction &MF,
1328   RegScavenger *RS) const {
1329   MachineFrameInfo &MFI = MF.getFrameInfo();
1330 
1331   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1332   const SIInstrInfo *TII = ST.getInstrInfo();
1333   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1334   MachineRegisterInfo &MRI = MF.getRegInfo();
1335   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1336 
1337   // Allocate spill slots for WWM reserved VGPRs.
1338   // For chain functions, we only need to do this if we have calls to
1339   // llvm.amdgcn.cs.chain.
1340   bool IsChainWithoutCalls =
1341       FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1342   if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1343     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
1344       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1345       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
1346                                  TRI->getSpillAlign(*RC));
1347     }
1348   }
1349 
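       // VGPR-to-AGPR spilling is only attempted when the target has MAI
       // instructions (and therefore AGPRs), the function actually spilled
       // VGPRs, and the option is enabled.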
1350   const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1351                                && EnableSpillVGPRToAGPR;
1352 
1353   if (SpillVGPRToAGPR) {
1354     // To track the spill frame indices handled in this pass.
1355     BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1356     BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1357 
1358     bool SeenDbgInstr = false;
1359 
1360     for (MachineBasicBlock &MBB : MF) {
1361       for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
1362         int FrameIndex;
1363         if (MI.isDebugInstr())
1364           SeenDbgInstr = true;
1365 
1366         if (TII->isVGPRSpill(MI)) {
1367           // Try to eliminate stack used by VGPR spills before frame
1368           // finalization.
1369           unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1370                                                      AMDGPU::OpName::vaddr);
1371           int FI = MI.getOperand(FIOp).getIndex();
1372           Register VReg =
1373             TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1374           if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1375                                                 TRI->isAGPR(MRI, VReg))) {
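                 // The spill can be replaced with VGPR<->AGPR copies instead of
                 // memory accesses, so eliminate its frame index now; the
                 // scavenger is positioned just past MI in case
                 // eliminateFrameIndex needs a scratch register.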
1376             assert(RS != nullptr);
1377             RS->enterBasicBlockEnd(MBB);
1378             RS->backward(std::next(MI.getIterator()));
1379             TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
1380             SpillFIs.set(FI);
1381             continue;
1382           }
1383         } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1384                    TII->isLoadFromStackSlot(MI, FrameIndex))
1385           if (!MFI.isFixedObjectIndex(FrameIndex))
1386             NonVGPRSpillFIs.set(FrameIndex);
1387       }
1388     }
1389 
1390     // Stack slot coloring may have assigned another object to the same stack
1391     // slot. If it has not, the VGPR-to-AGPR spill slot is dead.
1392     for (unsigned FI : SpillFIs.set_bits())
1393       if (!NonVGPRSpillFIs.test(FI))
1394         FuncInfo->setVGPRToAGPRSpillDead(FI);
1395 
1396     for (MachineBasicBlock &MBB : MF) {
1397       for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1398         MBB.addLiveIn(Reg);
1399 
1400       for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1401         MBB.addLiveIn(Reg);
1402 
1403       MBB.sortUniqueLiveIns();
1404 
1405       if (!SpillFIs.empty() && SeenDbgInstr) {
1406         // FIXME: The dead frame indices are replaced with a null register in
1407         // the debug value instructions. We should instead update them with the
1408         // correct register value, though that alone may not suffice.
1409         for (MachineInstr &MI : MBB) {
1410           if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
1411               !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
1412               SpillFIs[MI.getOperand(0).getIndex()]) {
1413             MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
1414           }
1415         }
1416       }
1417     }
1418   }
1419 
1420   // At this point we've already allocated all spilled SGPRs to VGPRs if we
1421   // can. Any remaining SGPR spills will go to memory, so move them back to the
1422   // default stack.
1423   bool HaveSGPRToVMemSpill =
1424       FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1425   assert(allSGPRSpillsAreDead(MF) &&
1426          "SGPR spill should have been removed in SILowerSGPRSpills");
1427 
1428   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1429   // but currently hasNonSpillStackObjects is set only from source
1430   // allocas. Stack temps produced from legalization are not counted currently.
1431   if (!allStackObjectsAreDead(MFI)) {
1432     assert(RS && "RegScavenger required if spilling");
1433 
1434     // Add an emergency spill slot
1435     RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1436 
1437     // If we are spilling SGPRs to memory with a large frame, we may need a
1438     // second VGPR emergency frame index.
1439     if (HaveSGPRToVMemSpill &&
1440         allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1441       RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
1442     }
1443   }
1444 }
1445 
1446 void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1447     MachineFunction &MF, RegScavenger *RS) const {
1448   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1449   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1450   MachineRegisterInfo &MRI = MF.getRegInfo();
1451   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1452 
1453   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1454     // On gfx908, we initially reserved the highest available VGPR for the
1455     // AGPR copy. Now that RA is done, check whether there exists an unused
1456     // VGPR that is lower than the one reserved before RA. If one exists, use
1457     // it for the AGPR copy instead of the one reserved before RA.
1458     Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1459     Register UnusedLowVGPR =
1460         TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1461     if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1462                           TRI->getHWRegIndex(VGPRForAGPRCopy))) {
1463       // Reserve this newly identified VGPR for the AGPR copy.
1464       // Reserved registers should already be frozen at this point,
1465       // so we can avoid calling MRI.freezeReservedRegs and just use
1466       // MRI.reserveReg.
1467       FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1468       MRI.reserveReg(UnusedLowVGPR, TRI);
1469     }
1470   }
1471   // We initially reserved the highest available SGPR pair for long branches.
1472   // Now, after RA, we shift down to a lower unused pair if one exists.
1473   Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1474   Register UnusedLowSGPR =
1475       TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
1476   // If LongBranchReservedReg is null, we didn't find a long branch and never
1477   // reserved a register to begin with, so there is nothing to shift down.
1478   // If UnusedLowSGPR is null, there is no available lower register to use,
1479   // so just keep the one originally set.
1480   if (LongBranchReservedReg && UnusedLowSGPR) {
1481     FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1482     MRI.reserveReg(UnusedLowSGPR, TRI);
1483   }
1484 }
1485 
1486 // The special SGPR spills, like the ones needed for FP, BP, or any reserved
1487 // registers, are delayed until frame lowering.
1488 void SIFrameLowering::determinePrologEpilogSGPRSaves(
1489     MachineFunction &MF, BitVector &SavedVGPRs,
1490     bool NeedExecCopyReservedReg) const {
1491   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1492   MachineRegisterInfo &MRI = MF.getRegInfo();
1493   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1494   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1495   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1496   LiveRegUnits LiveUnits;
1497   LiveUnits.init(*TRI);
1498   // Initially mark callee saved registers as used so we will not choose them
1499   // while looking for scratch SGPRs.
1500   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1501   for (unsigned I = 0; CSRegs[I]; ++I)
1502     LiveUnits.addReg(CSRegs[I]);
1503 
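       // The EXEC copy, if needed, must be wide enough to hold the wave mask:
       // an SGPR pair in wave64 and a single SGPR in wave32.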
1504   const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1505 
1506   if (NeedExecCopyReservedReg) {
1507     Register ReservedReg = MFI->getSGPRForEXECCopy();
1508     assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
1509     Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1510     if (UnusedScratchReg) {
1511       // If we found an unused scratch SGPR, reserve it for the EXEC copy; no
1512       // spill is needed in that case.
1513       MFI->setSGPRForEXECCopy(UnusedScratchReg);
1514       LiveUnits.addReg(UnusedScratchReg);
1515     } else {
1516       // Needs spill.
1517       assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
1518              "Re-reserving spill slot for EXEC copy register");
1519       getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
1520                                      /*IncludeScratchCopy=*/false);
1521     }
1522   }
1523 
1524   // hasFP only knows about stack objects that already exist. We're now
1525   // determining the stack slots that will be created, so we have to predict
1526   // them. Stack objects force FP usage with calls.
1527   //
1528   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1529   // don't want to report it here.
1530   //
1531   // FIXME: Is this really hasReservedCallFrame?
1532   const bool WillHaveFP =
1533       FrameInfo.hasCalls() &&
1534       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1535 
1536   if (WillHaveFP || hasFP(MF)) {
1537     Register FramePtrReg = MFI->getFrameOffsetReg();
1538     assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1539            "Re-reserving spill slot for FP");
1540     getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
1541   }
1542 
1543   if (TRI->hasBasePointer(MF)) {
1544     Register BasePtrReg = TRI->getBaseRegister();
1545     assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1546            "Re-reserving spill slot for BP");
1547     getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
1548   }
1549 }
1550 
1551 // Only report VGPRs to generic code.
1552 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1553                                            BitVector &SavedVGPRs,
1554                                            RegScavenger *RS) const {
1555   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1556 
1557   // If this is a function with the amdgpu_cs_chain[_preserve] calling
1558   // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1559   // we don't need to save and restore anything.
1560   if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1561     return;
1562 
1563   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1564   if (MFI->isEntryFunction())
1565     return;
1566 
1567   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1568   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1569   const SIInstrInfo *TII = ST.getInstrInfo();
1570   bool NeedExecCopyReservedReg = false;
1571 
1572   MachineInstr *ReturnMI = nullptr;
1573   for (MachineBasicBlock &MBB : MF) {
1574     for (MachineInstr &MI : MBB) {
1575       // WRITELANE instructions used for SGPR spills can overwrite the inactive
1576       // lanes of VGPRs, so the callee must spill and restore them even if they
1577       // are marked caller-saved.
1578 
1579       // TODO: Handle this elsewhere at an earlier point. Walking through all
1580       // MBBs here would be a bad heuristic. A better way would be to call
1581       // allocateWWMSpill during the regalloc pipeline whenever a physical
1582       // register is allocated for the intended virtual registers.
1583       if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
1584         MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
1585       else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
1586         MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
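           // Spilling a WWM register needs EXEC saved and restored around the
           // spill (all lanes must be active), so reserve an SGPR for that
           // EXEC copy.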
1587       else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
1588         NeedExecCopyReservedReg = true;
1589       else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1590                MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1591                (MFI->isChainFunction() &&
1592                 TII->isChainCallOpcode(MI.getOpcode()))) {
1593         // We expect all returns to have the same number of register operands.
1594         assert(!ReturnMI ||
1595                (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1596                 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1597         ReturnMI = &MI;
1598       }
1599     }
1600   }
1601 
1602   // Remove any VGPRs used in the return value because these do not need to be saved.
1603   // This prevents CSR restore from clobbering return VGPRs.
1604   if (ReturnMI) {
1605     for (auto &Op : ReturnMI->operands()) {
1606       if (Op.isReg())
1607         SavedVGPRs.reset(Op.getReg());
1608     }
1609   }
1610 
1611   // Ignore the SGPRs the default implementation found.
1612   SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1613 
1614   // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1615   // gfx908 has no direct AGPR loads and stores, so spilling an AGPR also
1616   // requires a temporary VGPR.
1617   if (!ST.hasGFX90AInsts())
1618     SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1619 
1620   determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1621 
1622   // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1623   // allow the default insertion to handle them.
1624   for (auto &Reg : MFI->getWWMSpills())
1625     SavedVGPRs.reset(Reg.first);
1626 
1627   // Mark all lane VGPRs as BB LiveIns.
1628   for (MachineBasicBlock &MBB : MF) {
1629     for (auto &Reg : MFI->getWWMSpills())
1630       MBB.addLiveIn(Reg.first);
1631 
1632     MBB.sortUniqueLiveIns();
1633   }
1634 }
1635 
1636 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1637                                                BitVector &SavedRegs,
1638                                                RegScavenger *RS) const {
1639   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1640   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1641   if (MFI->isEntryFunction())
1642     return;
1643 
1644   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1645   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1646 
1647   // The SP is specifically managed and we don't want extra spills of it.
1648   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1649 
1650   const BitVector AllSavedRegs = SavedRegs;
1651   SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1652 
1653   // We have to anticipate introducing CSR VGPR spills, or spills of the
1654   // caller-saved VGPR reserved for SGPR spills, since we now always create a
1655   // stack entry for it even when there are no other stack objects, and we
1656   // require an FP whenever there is a call and a stack. We will allocate a
1657   // VGPR for SGPR spills if there are any SGPR spills, CSR or otherwise.
1658   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1659   const bool WillHaveFP =
1660       FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1661 
1662   // FP will be specially managed like SP.
1663   if (WillHaveFP || hasFP(MF))
1664     SavedRegs.reset(MFI->getFrameOffsetReg());
1665 
1666   // The return address use in the return instruction is hidden behind the
1667   // SI_RETURN pseudo. Because of that, and because IPRA computes actual
1668   // register usage rather than using the CSR list, clobbering of the return
1669   // address by function calls (D117243) or otherwise (D120922) is not seen
1670   // by IPRA's register usage collection. Setting these bits ensures the
1671   // return address is saved and restored in those scenarios.
1672   const MachineRegisterInfo &MRI = MF.getRegInfo();
1673   Register RetAddrReg = TRI->getReturnAddressReg(MF);
1674   if (!MFI->isEntryFunction() &&
1675       (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1676     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1677     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1678   }
1679 }
1680 
1681 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1682     MachineFunction &MF, const TargetRegisterInfo *TRI,
1683     std::vector<CalleeSavedInfo> &CSI) const {
1684   if (CSI.empty())
1685     return true; // Early exit if no callee saved registers are modified!
1686 
1687   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1688   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1689   const SIRegisterInfo *RI = ST.getRegisterInfo();
1690   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1691   Register BasePtrReg = RI->getBaseRegister();
1692   Register SGPRForFPSaveRestoreCopy =
1693       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1694   Register SGPRForBPSaveRestoreCopy =
1695       FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
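       // If neither FP nor BP is saved through a scratch SGPR copy, there is
       // nothing to override; let the default spill slot assignment proceed.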
1696   if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1697     return false;
1698 
1699   unsigned NumModifiedRegs = 0;
1700 
1701   if (SGPRForFPSaveRestoreCopy)
1702     NumModifiedRegs++;
1703   if (SGPRForBPSaveRestoreCopy)
1704     NumModifiedRegs++;
1705 
1706   for (auto &CS : CSI) {
1707     if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1708       CS.setDstReg(SGPRForFPSaveRestoreCopy);
1709       if (--NumModifiedRegs == 0)
1710         break;
1711     } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1712       CS.setDstReg(SGPRForBPSaveRestoreCopy);
1713       if (--NumModifiedRegs == 0)
1714         break;
1715     }
1716   }
1717 
1718   return false;
1719 }
1720 
1721 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1722   const MachineFunction &MF) const {
1723 
1724   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1725   const MachineFrameInfo &MFI = MF.getFrameInfo();
1726   const SIInstrInfo *TII = ST.getInstrInfo();
1727   uint64_t EstStackSize = MFI.estimateStackSize(MF);
1728   uint64_t MaxOffset = EstStackSize - 1;
1729 
1730   // We need the emergency stack slots to be allocated in range of the
1731   // MUBUF/flat scratch immediate offset from the base register, so assign these
1732   // first at the incoming SP position.
1733   //
1734   // TODO: We could try sorting the objects to find a hole in the first bytes
1735   // rather than allocating as close as possible. This could save a lot of
1736   // space on frames with alignment requirements.
1737   if (ST.enableFlatScratch()) {
1738     if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1739                                SIInstrFlags::FlatScratch))
1740       return false;
1741   } else {
1742     if (TII->isLegalMUBUFImmOffset(MaxOffset))
1743       return false;
1744   }
1745 
1746   return true;
1747 }
1748 
1749 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1750   MachineFunction &MF,
1751   MachineBasicBlock &MBB,
1752   MachineBasicBlock::iterator I) const {
1753   int64_t Amount = I->getOperand(0).getImm();
1754   if (Amount == 0)
1755     return MBB.erase(I);
1756 
1757   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1758   const SIInstrInfo *TII = ST.getInstrInfo();
1759   const DebugLoc &DL = I->getDebugLoc();
1760   unsigned Opc = I->getOpcode();
1761   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1762   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1763 
1764   if (!hasReservedCallFrame(MF)) {
1765     Amount = alignTo(Amount, getStackAlign());
1766     assert(isUInt<32>(Amount) && "exceeded stack address space size");
1767     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1768     Register SPReg = MFI->getStackPtrOffsetReg();
1769 
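         // Scale the adjustment for the scratch layout, and negate it when
         // tearing the call frame down so SP moves back.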
1770     Amount *= getScratchScaleFactor(ST);
1771     if (IsDestroy)
1772       Amount = -Amount;
1773     auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1774         .addReg(SPReg)
1775         .addImm(Amount);
1776     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1777   } else if (CalleePopAmount != 0) {
1778     llvm_unreachable("is this used?");
1779   }
1780 
1781   return MBB.erase(I);
1782 }
1783 
1784 /// Returns true if the frame will require a reference to the stack pointer.
1785 ///
1786 /// This is the set of conditions common to setting up the stack pointer in a
1787 /// kernel, and for using a frame pointer in a callable function.
1788 ///
1789 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1790 /// references SP.
1791 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1792   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1793 }
1794 
1795 // The FP for kernels is always known to be 0, so we never really need to set
1796 // up an explicit register for it. However, DisableFramePointerElim will force
1797 // us to use a register for it.
1798 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1799   const MachineFrameInfo &MFI = MF.getFrameInfo();
1800 
1801   // For entry & chain functions we can use an immediate offset in most cases,
1802   // so the presence of calls doesn't imply we need a distinct frame pointer.
1803   if (MFI.hasCalls() &&
1804       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1805       !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1806     // All offsets are unsigned, so need to be addressed in the same direction
1807     // as stack growth.
1808 
1809     // FIXME: This function is pretty broken, since it can be called before the
1810     // frame layout is determined or CSR spills are inserted.
1811     return MFI.getStackSize() != 0;
1812   }
1813 
1814   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1815          MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1816              MF) ||
1817          MF.getTarget().Options.DisableFramePointerElim(MF);
1818 }
1819 
1820 // This is essentially a reduced version of hasFP for entry functions. Since the
1821 // stack pointer is known 0 on entry to kernels, we never really need an FP
1822 // register. We may need to initialize the stack pointer depending on the frame
1823 // properties, which logically overlaps many of the cases where an ordinary
1824 // function would require an FP.
1825 // Also used for chain functions. While not technically entry functions, chain
1826 // functions may need to set up a stack pointer in some situations.
1827 bool SIFrameLowering::requiresStackPointerReference(
1828     const MachineFunction &MF) const {
1829   // Callable functions always require a stack pointer reference.
1830   assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
1831           MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
1832          "only expected to call this for entry points and chain functions");
1833 
1834   const MachineFrameInfo &MFI = MF.getFrameInfo();
1835 
1836   // Entry points ordinarily don't need to initialize SP. We have to set it up
1837   // for callees if there are any. Also note tail calls are impossible/don't
1838   // make any sense for kernels.
1839   if (MFI.hasCalls())
1840     return true;
1841 
1842   // We still need to initialize the SP if we're doing anything weird that
1843   // references the SP, like variable sized stack objects.
1844   return frameTriviallyRequiresSP(MFI);
1845 }
1846