//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
  "amdgpu-spill-vgpr-to-agpr",
  cl::desc("Enable spilling VGPRs to AGPRs"),
  cl::ReallyHidden,
  cl::init(true));

// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
// MCRegisters. This should reduce the number of iterations and avoid redundant
// checking.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LiveRegUnits &LiveUnits,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
        !MRI.isReserved(Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(
    MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
    const TargetRegisterClass &RC, bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveUnits.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveUnits, RC);

  for (MCRegister Reg : RC) {
    if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
      return Reg;
  }

  return MCRegister();
}

/// Decide where to save \p SGPR across the prologue/epilogue: prefer a copy
/// into a free scratch SGPR, then a spill to a free physical VGPR lane, and
/// fall back to a memory spill.
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free lane to spill, and no free register to save the
      // SGPR, so we're forced to take another VGPR to use for the spill.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}

// We need to emit stack operations specially here because the prologue and
// epilogue use a different frame register than the rest of the function
// (i.e. not the one getFrameRegister would return).
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveUnits.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
  if (IsKill)
    LiveUnits.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
}

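// Materialize the 64-bit pointer to the Global Information Table (GIT) in
// \p TargetReg. The high half comes from the amdgpu-git-ptr-high function
// attribute when one is set, and otherwise from the PC via s_getpc; the low
// half is the GIT offset passed in by the PAL ABI.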
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
    .addReg(GitPtrLo);
}

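// Lazily initialize \p LiveUnits at the given insertion point: seed it with
// the block live-ins for a prologue, or with the live-outs stepped backward
// across the instruction at \p MBBI for an epilogue.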
static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
                          const SIMachineFunctionInfo *FuncInfo,
                          MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveUnits.empty()) {
    LiveUnits.init(TRI);
    if (IsProlog) {
      LiveUnits.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveUnits.addLiveOuts(MBB);
      LiveUnits.stepBackward(*MBBI);
    }
  }
}

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;
  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

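  // Reload each dword of the SGPR from the frame index into a temporary VGPR
  // and move it back into the SGPR subregister with v_readfirstlane.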
  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    LiveUnits.init(*TRI);
    LiveUnits.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
          MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
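    // On GFX10+, the FLAT_SCRATCH base is a hardware register that is only
    // writable via s_setreg; earlier targets write the FLAT_SCR_LO/HI SGPR
    // pair directly.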
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      using namespace AMDGPU::Hwreg;
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      MRI.reserveReg(Reg, TRI);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

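// Frame offsets are in per-lane bytes. With MUBUF scratch, memory is swizzled
// per wave, so per-lane sizes must be scaled by the wavefront size to get a
// wave-relative byte offset; with flat scratch no scaling is needed.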
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC we found overlaps the scratch wave
  // offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }

    // FIXME: We can spill incoming arguments and restore at the end of the
    // prolog.
    if (!ScratchWaveOffsetReg)
      report_fatal_error(
          "could not find temporary scratch offset register in prolog");
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
  if (!mayReserveScratchForCWSR(MF)) {
    if (hasFP(MF)) {
      Register FPReg = MFI->getFrameOffsetReg();
      assert(FPReg != AMDGPU::FP_REG);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
    }

    if (requiresStackPointerReference(MF)) {
      Register SPReg = MFI->getStackPtrOffsetReg();
      assert(SPReg != AMDGPU::SP_REG);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
    }
  } else {
    // We need to check if we're on a compute queue - if we are, then the CWSR
    // trap handler may need to store some VGPRs on the stack. The first VGPR
    // block is saved separately, so we only need to allocate space for any
    // additional VGPR blocks used. For now, we will make sure there's enough
    // room for the theoretical maximum number of VGPRs that can be allocated.
    // FIXME: Figure out if the shader uses fewer VGPRs in practice.
    assert(hasFP(MF));
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    unsigned VGPRSize = llvm::alignTo(
        (ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
         AMDGPU::IsaInfo::getVGPRAllocGranule(&ST,
                                              MFI->getDynamicVGPRBlockSize())) *
            4,
        FrameInfo.getMaxAlign());
    MFI->setScratchReservedForDynamicVGPRs(VGPRSize);

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
        .addImm(AMDGPU::Hwreg::HwregEncoding::encode(
            AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
    // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
    // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
    // SCC, so we need to check for 0 manually.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
    if (requiresStackPointerReference(MF)) {
      Register SPReg = MFI->getStackPtrOffsetReg();
      assert(SPReg != AMDGPU::SP_REG);

      // If at least one of the constants can be inlined, then we can use
      // s_cselect. Otherwise, use a mov and cmovk.
      if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()) ||
          AMDGPU::isInlinableLiteral32(Offset + VGPRSize,
                                       ST.hasInv2PiInlineImm())) {
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CSELECT_B32), SPReg)
            .addImm(Offset + VGPRSize)
            .addImm(Offset);
      } else {
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), SPReg)
            .addImm(Offset + VGPRSize);
      }
    }
  }

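  // Only initialize flat scratch if it may actually be used: an explicit use
  // of FLAT_SCR, the presence of calls, or live stack objects when flat
  // scratch backs all scratch accesses.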
  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // cpol
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto *MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // cpol
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Lo_32(Rsrc23))
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Hi_32(Rsrc23))
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

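// Stack IDs the AMDGPU backend supports. SGPRSpill objects are lowered by the
// target-specific SGPR spill code; ScalableVector and WasmLocal are not used
// for AMDGPU.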
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true;
// otherwise, activate all lanes. Returns the register holding the saved EXEC
// mask.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveUnits, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveUnits.addReg(ScratchExecCopy);

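  // s_xor_saveexec with an all-ones operand inverts EXEC (activating only the
  // previously inactive lanes), while s_or_saveexec sets EXEC to all ones; in
  // both cases the old EXEC value is saved to the destination.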
  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // If FP is saved to a scratch SGPR, skip it here; that save has already
    // been emitted. Otherwise, FP has been copied into a temporary register,
    // so spill that copy instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip it if FP needs to be restored from a scratch SGPR. Otherwise,
    // restore the FP value into a temporary register; the frame pointer should
    // be overwritten only at the end, when all other spills have been restored
    // from the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  if (FuncInfo->isChainFunction()) {
    // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
    // are free to set one up if they need it.
    bool UseSP = requiresStackPointerReference(MF);
    if (UseSP) {
      assert(StackPtrReg != AMDGPU::SP_REG);

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
          .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
    }
  }

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

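    // Realign the frame pointer: over-add the worst-case padding, then mask
    // down to the alignment boundary. All immediates are scaled for swizzled
    // scratch addressing.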
1212     // s_add_i32 s33, s32, NumBytes
1213     // s_and_b32 s33, s33, 0b111...0000
1214     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
1215         .addReg(StackPtrReg)
1216         .addImm((Alignment - 1) * getScratchScaleFactor(ST))
1217         .setMIFlag(MachineInstr::FrameSetup);
1218     auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
1219         .addReg(FramePtrReg, RegState::Kill)
1220         .addImm(-Alignment * getScratchScaleFactor(ST))
1221         .setMIFlag(MachineInstr::FrameSetup);
1222     And->getOperand(3).setIsDead(); // Mark SCC as dead.
1223     FuncInfo->setIsStackRealigned(true);
1224   } else if ((HasFP = hasFP(MF))) {
1225     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1226         .addReg(StackPtrReg)
1227         .setMIFlag(MachineInstr::FrameSetup);
1228   }
1229 
1230   // If FP is used, emit the CSR spills with FP base register.
1231   if (HasFP) {
1232     emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1233                        FramePtrRegScratchCopy);
1234     if (FramePtrRegScratchCopy)
1235       LiveUnits.removeReg(FramePtrRegScratchCopy);
1236   }
1237 
1238   // If we need a base pointer, set it up here. It's whatever the value of
1239   // the stack pointer is at this point. Any variable size objects will be
1240   // allocated after this, so we can still use the base pointer to reference
1241   // the incoming arguments.
1242   if ((HasBP = TRI.hasBasePointer(MF))) {
1243     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1244         .addReg(StackPtrReg)
1245         .setMIFlag(MachineInstr::FrameSetup);
1246   }
1247 
1248   if (HasFP && RoundedSize != 0) {
1249     auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1250         .addReg(StackPtrReg)
1251         .addImm(RoundedSize * getScratchScaleFactor(ST))
1252         .setMIFlag(MachineInstr::FrameSetup);
1253     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1254   }
1255 
1256   bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1257   (void)FPSaved;
1258   assert((!HasFP || FPSaved) &&
1259          "Needed to save FP but didn't save it anywhere");
1260 
1261   // If we allow spilling to AGPRs we may have saved FP but then spill
1262   // everything into AGPRs instead of the stack.
1263   assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
1264          "Saved FP but didn't need it");
1265 
1266   bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
1267   (void)BPSaved;
1268   assert((!HasBP || BPSaved) &&
1269          "Needed to save BP but didn't save it anywhere");
1270 
1271   assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1272 }
1273 
1274 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1275                                    MachineBasicBlock &MBB) const {
1276   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1277   if (FuncInfo->isEntryFunction())
1278     return;
1279 
1280   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1281   const SIInstrInfo *TII = ST.getInstrInfo();
1282   const SIRegisterInfo &TRI = TII->getRegisterInfo();
1283   MachineRegisterInfo &MRI = MF.getRegInfo();
1284   LiveRegUnits LiveUnits;
1285   // Get the insert location for the epilogue. If there were no terminators in
1286   // the block, get the last instruction.
1287   MachineBasicBlock::iterator MBBI = MBB.end();
1288   DebugLoc DL;
1289   if (!MBB.empty()) {
1290     MBBI = MBB.getLastNonDebugInstr();
1291     if (MBBI != MBB.end())
1292       DL = MBBI->getDebugLoc();
1293 
1294     MBBI = MBB.getFirstTerminator();
1295   }
1296 
1297   const MachineFrameInfo &MFI = MF.getFrameInfo();
1298   uint32_t NumBytes = MFI.getStackSize();
1299   uint32_t RoundedSize = FuncInfo->isStackRealigned()
1300                              ? NumBytes + MFI.getMaxAlign().value()
1301                              : NumBytes;
1302   const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1303   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1304   bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1305 
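       // Deallocate the frame by restoring SP: the BP, if present, still holds
       // the incoming SP value; otherwise the FP (the realigned SP) is used as
       // the restore point.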
1306   if (RoundedSize != 0) {
1307     if (TRI.hasBasePointer(MF)) {
1308       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1309           .addReg(TRI.getBaseRegister())
1310           .setMIFlag(MachineInstr::FrameDestroy);
1311     } else if (hasFP(MF)) {
1312       BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1313           .addReg(FramePtrReg)
1314           .setMIFlag(MachineInstr::FrameDestroy);
1315     }
1316   }
1317 
1318   Register FramePtrRegScratchCopy;
1319   Register SGPRForFPSaveRestoreCopy =
1320       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1321   if (FPSaved) {
1322     // CSR spill restores should use FP as the base register. If
1323     // SGPRForFPSaveRestoreCopy is not valid, restore the previous value of
1324     // FP into a new scratch register and copy it to FP later, once the
1325     // other registers are restored from the current stack frame.
1326     initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1327     if (SGPRForFPSaveRestoreCopy) {
1328       LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1329     } else {
1330       FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1331           MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1332       if (!FramePtrRegScratchCopy)
1333         report_fatal_error("failed to find free scratch register");
1334 
1335       LiveUnits.addReg(FramePtrRegScratchCopy);
1336     }
1337 
1338     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1339                          FramePtrRegScratchCopy);
1340   }
1341 
1342   if (FPSaved) {
1343     // Insert the copy to restore FP.
1344     Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1345                                                : FramePtrRegScratchCopy;
1346     MachineInstrBuilder MIB =
1347         BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1348             .addReg(SrcReg);
1349     if (SGPRForFPSaveRestoreCopy)
1350       MIB.setMIFlag(MachineInstr::FrameDestroy);
1351   } else {
1352     // Insert the CSR spill restores with SP as the base register.
1353     emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
1354                          FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1355                          FramePtrRegScratchCopy);
1356   }
1357 }
1358 
1359 #ifndef NDEBUG
1360 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1361   const MachineFrameInfo &MFI = MF.getFrameInfo();
1362   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1363   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1364        I != E; ++I) {
1365     if (!MFI.isDeadObjectIndex(I) &&
1366         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1367         !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1368       return false;
1369     }
1370   }
1371 
1372   return true;
1373 }
1374 #endif
1375 
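     // Frame objects are addressed relative to the single frame register
     // chosen by SIRegisterInfo::getFrameRegister (the FP when the function
     // has one, otherwise the SP), so only the static object offset is needed.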
1376 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1377                                                     int FI,
1378                                                     Register &FrameReg) const {
1379   const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1380 
1381   FrameReg = RI->getFrameRegister(MF);
1382   return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1383 }
1384 
1385 void SIFrameLowering::processFunctionBeforeFrameFinalized(
1386   MachineFunction &MF,
1387   RegScavenger *RS) const {
1388   MachineFrameInfo &MFI = MF.getFrameInfo();
1389 
1390   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1391   const SIInstrInfo *TII = ST.getInstrInfo();
1392   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1393   MachineRegisterInfo &MRI = MF.getRegInfo();
1394   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1395 
1396   const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1397                                && EnableSpillVGPRToAGPR;
1398 
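       // On subtargets with MAI instructions, try to fold VGPR spills into
       // free AGPRs (and AGPR spills into VGPRs) so the spill never touches
       // scratch memory.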
1399   if (SpillVGPRToAGPR) {
1400     // To track the spill frame indices handled in this pass.
1401     BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1402     BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1403 
1404     bool SeenDbgInstr = false;
1405 
1406     for (MachineBasicBlock &MBB : MF) {
1407       for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
1408         int FrameIndex;
1409         if (MI.isDebugInstr())
1410           SeenDbgInstr = true;
1411 
1412         if (TII->isVGPRSpill(MI)) {
1413           // Try to eliminate the stack space used by VGPR spills before
1414           // frame finalization.
1415           unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1416                                                      AMDGPU::OpName::vaddr);
1417           int FI = MI.getOperand(FIOp).getIndex();
1418           Register VReg =
1419             TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1420           if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1421                                                 TRI->isAGPR(MRI, VReg))) {
1422             assert(RS != nullptr);
1423             RS->enterBasicBlockEnd(MBB);
1424             RS->backward(std::next(MI.getIterator()));
1425             TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
1426             SpillFIs.set(FI);
1427             continue;
1428           }
1429         } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1430                    TII->isLoadFromStackSlot(MI, FrameIndex))
1431           if (!MFI.isFixedObjectIndex(FrameIndex))
1432             NonVGPRSpillFIs.set(FrameIndex);
1433       }
1434     }
1435 
1436     // Stack slot coloring may assign different objects to the same stack slot.
1437     // If not, then the VGPR to AGPR spill slot is dead.
1438     for (unsigned FI : SpillFIs.set_bits())
1439       if (!NonVGPRSpillFIs.test(FI))
1440         FuncInfo->setVGPRToAGPRSpillDead(FI);
1441 
1442     for (MachineBasicBlock &MBB : MF) {
1443       for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1444         MBB.addLiveIn(Reg);
1445 
1446       for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1447         MBB.addLiveIn(Reg);
1448 
1449       MBB.sortUniqueLiveIns();
1450 
1451       if (!SpillFIs.empty() && SeenDbgInstr) {
1452         // FIXME: The dead frame indices are replaced with a null register in
1453         // the debug value instructions. We should instead update them with the
1454         // correct register value, though it is unclear the value alone is enough.
1455         for (MachineInstr &MI : MBB) {
1456           if (MI.isDebugValue()) {
1457             uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
1458             if (MI.getOperand(StackOperandIdx).isFI() &&
1459                 !MFI.isFixedObjectIndex(
1460                     MI.getOperand(StackOperandIdx).getIndex()) &&
1461                 SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) {
1462               MI.getOperand(StackOperandIdx)
1463                   .ChangeToRegister(Register(), false /*isDef*/);
1464             }
1465           }
1466         }
1467       }
1468     }
1469   }
1470 
1471   // At this point we've already allocated all spilled SGPRs to VGPRs if we
1472   // can. Any remaining SGPR spills will go to memory, so move them back to the
1473   // default stack.
1474   bool HaveSGPRToVMemSpill =
1475       FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1476   assert(allSGPRSpillsAreDead(MF) &&
1477          "SGPR spill should have been removed in SILowerSGPRSpills");
1478 
1479   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1480   // but currently hasNonSpillStackObjects is set only from source
1481   // allocas. Stack temps produced from legalization are not counted currently.
1482   if (!allStackObjectsAreDead(MFI)) {
1483     assert(RS && "RegScavenger required if spilling");
1484 
1485     // Add an emergency spill slot
1486     RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1487 
1488     // If we are spilling SGPRs to memory with a large frame, we may need a
1489     // second VGPR emergency frame index.
1490     if (HaveSGPRToVMemSpill &&
1491         allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1492       RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(4, Align(4)));
1493     }
1494   }
1495 }
1496 
1497 void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1498     MachineFunction &MF, RegScavenger *RS) const {
1499   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1500   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1501   MachineRegisterInfo &MRI = MF.getRegInfo();
1502   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1503 
1504   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1505     // On gfx908, we initially reserved the highest available VGPR for the
1506     // AGPR copy. Now that we are done with RA, check whether there exists an
1507     // unused VGPR lower than the one reserved earlier. If one exists, use it
1508     // for the AGPR copy instead of the register reserved before RA.
1509     Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1510     Register UnusedLowVGPR =
1511         TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1512     if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1513                           TRI->getHWRegIndex(VGPRForAGPRCopy))) {
1514       // Reserve this newly identified VGPR for the AGPR copy.
1515       // Reserved registers should already be frozen at this point,
1516       // so we can avoid calling MRI.freezeReservedRegs and just use
1517       // MRI.reserveReg.
1518       FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1519       MRI.reserveReg(UnusedLowVGPR, TRI);
1520     }
1521   }
1522   // We initially reserved the highest available SGPR pair for long branches;
1523   // now, after RA, we shift down to a lower unused one if one exists.
1524   Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1525   Register UnusedLowSGPR =
1526       TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
1527   // If LongBranchReservedReg is null, then we didn't find a long branch
1528   // and never reserved a register to begin with, so there is nothing to
1529   // shift down. If UnusedLowSGPR is null, there is no available lower
1530   // register to use, so just keep the original one we set.
1531   if (LongBranchReservedReg && UnusedLowSGPR) {
1532     FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1533     MRI.reserveReg(UnusedLowSGPR, TRI);
1534   }
1535 }
1536 
1537 // The special SGPR spills, like the ones needed for the FP, the BP, or any
1538 // reserved registers, are delayed until frame lowering.
1539 void SIFrameLowering::determinePrologEpilogSGPRSaves(
1540     MachineFunction &MF, BitVector &SavedVGPRs,
1541     bool NeedExecCopyReservedReg) const {
1542   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1543   MachineRegisterInfo &MRI = MF.getRegInfo();
1544   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1545   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1546   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1547   LiveRegUnits LiveUnits;
1548   LiveUnits.init(*TRI);
1549   // Initially mark callee saved registers as used so we will not choose them
1550   // while looking for scratch SGPRs.
1551   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1552   for (unsigned I = 0; CSRegs[I]; ++I)
1553     LiveUnits.addReg(CSRegs[I]);
1554 
1555   const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1556 
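       // Whole-wave register spills need a place to stash EXEC while all lanes
       // are enabled. Prefer a completely unused scratch SGPR; failing that,
       // keep the reserved register and give it a spill location of its own.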
1557   Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
1558   if (NeedExecCopyReservedReg ||
1559       (ReservedRegForExecCopy &&
1560        MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
1561     MRI.reserveReg(ReservedRegForExecCopy, TRI);
1562     Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1563     if (UnusedScratchReg) {
1564       // If we found an unused scratch SGPR, reserve that register itself for
1565       // the EXEC copy; no spills are needed in that case.
1566       MFI->setSGPRForEXECCopy(UnusedScratchReg);
1567       MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
1568       LiveUnits.addReg(UnusedScratchReg);
1569     } else {
1570       // Needs spill.
1571       assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
1572              "Re-reserving spill slot for EXEC copy register");
1573       getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
1574                                      /*IncludeScratchCopy=*/false);
1575     }
1576   } else if (ReservedRegForExecCopy) {
1577     // Reset it at this point, since no whole-wave copies or spills were
1578     // encountered.
1579     MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
1580   }
1581 
1582   // hasFP only knows about stack objects that already exist. We're now
1583   // determining the stack slots that will be created, so we have to predict
1584   // them. Stack objects force FP usage with calls.
1585   //
1586   // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1587   // don't want to report it here.
1588   //
1589   // FIXME: Is this really hasReservedCallFrame?
1590   const bool WillHaveFP =
1591       FrameInfo.hasCalls() &&
1592       (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1593 
1594   if (WillHaveFP || hasFP(MF)) {
1595     Register FramePtrReg = MFI->getFrameOffsetReg();
1596     assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1597            "Re-reserving spill slot for FP");
1598     getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
1599   }
1600 
1601   if (TRI->hasBasePointer(MF)) {
1602     Register BasePtrReg = TRI->getBaseRegister();
1603     assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1604            "Re-reserving spill slot for BP");
1605     getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
1606   }
1607 }
1608 
1609 // Only report VGPRs to generic code.
1610 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1611                                            BitVector &SavedVGPRs,
1612                                            RegScavenger *RS) const {
1613   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1614 
1615   // If this is a function with the amdgpu_cs_chain[_preserve] calling
1616   // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1617   // we don't need to save and restore anything.
1618   if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1619     return;
1620 
1621   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
1622 
1623   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1624   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1625   const SIInstrInfo *TII = ST.getInstrInfo();
1626   bool NeedExecCopyReservedReg = false;
1627 
1628   MachineInstr *ReturnMI = nullptr;
1629   for (MachineBasicBlock &MBB : MF) {
1630     for (MachineInstr &MI : MBB) {
1631       // TODO: Walking through all MBBs here would be a bad heuristic. Better
1632       // handle them elsewhere.
1633       if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
1634         NeedExecCopyReservedReg = true;
1635       else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1636                MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1637                (MFI->isChainFunction() &&
1638                 TII->isChainCallOpcode(MI.getOpcode()))) {
1639         // We expect all returns to have the same number of register operands.
1640         assert(!ReturnMI ||
1641                (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1642                 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1643         ReturnMI = &MI;
1644       }
1645     }
1646   }
1647 
1648   SmallVector<Register> SortedWWMVGPRs;
1649   for (Register Reg : MFI->getWWMReservedRegs()) {
1650     // The shift-back is needed only for the VGPRs used for SGPR spills,
1651     // which are 32 bits wide. The SIPreAllocateWWMRegs pass can add tuples
1652     // into the WWM reserved registers.
1653     const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1654     if (TRI->getRegSizeInBits(*RC) != 32)
1655       continue;
1656     SortedWWMVGPRs.push_back(Reg);
1657   }
1658 
1659   sort(SortedWWMVGPRs, std::greater<Register>());
1660   MFI->shiftWwmVGPRsToLowestRange(MF, SortedWWMVGPRs, SavedVGPRs);
1661 
1662   if (MFI->isEntryFunction())
1663     return;
1664 
1665   // Remove any VGPRs used in the return value, because these do not need to
1666   // be saved. This prevents the CSR restore from clobbering return VGPRs.
1667   if (ReturnMI) {
1668     for (auto &Op : ReturnMI->operands()) {
1669       if (Op.isReg())
1670         SavedVGPRs.reset(Op.getReg());
1671     }
1672   }
1673 
1674   // Create the stack objects for WWM registers now.
1675   for (Register Reg : MFI->getWWMReservedRegs()) {
1676     const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1677     MFI->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
1678                           TRI->getSpillAlign(*RC));
1679   }
1680 
1681   // Ignore the SGPRs the default implementation found.
1682   SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1683 
1684   // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1685   // In gfx908 there are no AGPR loads and stores, and thus spilling also
1686   // requires a temporary VGPR.
1687   if (!ST.hasGFX90AInsts())
1688     SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1689 
1690   determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1691 
1692   // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1693   // allow the default insertion to handle them.
1694   for (auto &Reg : MFI->getWWMSpills())
1695     SavedVGPRs.reset(Reg.first);
1696 }
1697 
1698 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1699                                                BitVector &SavedRegs,
1700                                                RegScavenger *RS) const {
1701   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1702   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1703   if (MFI->isEntryFunction())
1704     return;
1705 
1706   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1707   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1708 
1709   // The SP is specifically managed and we don't want extra spills of it.
1710   SavedRegs.reset(MFI->getStackPtrOffsetReg());
1711 
1712   const BitVector AllSavedRegs = SavedRegs;
1713   SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1714 
1715   // We have to anticipate introducing CSR VGPR spills or a spill of the
1716   // caller-saved VGPR reserved for SGPR spills, as we now always create a
1717   // stack entry for it even if we don't have any stack objects already,
1718   // since we require an FP if there is a call and a stack. We will allocate
1719   // a VGPR for SGPR spills if there are any SGPR spills, CSR or otherwise.
1720   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1721   const bool WillHaveFP =
1722       FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1723 
1724   // FP will be specially managed like SP.
1725   if (WillHaveFP || hasFP(MF))
1726     SavedRegs.reset(MFI->getFrameOffsetReg());
1727 
1728   // The return address use in the return instruction is hidden through the
1729   // SI_RETURN pseudo. Given that, and since IPRA computes actual register
1730   // usage and does not use the CSR list, the clobbering of the return
1731   // address by function calls (D117243) or otherwise (D120922) is not seen
1732   // by IPRA's register usage collection. Setting these bits ensures the
1733   // return address is saved and restored in those scenarios.
1734   const MachineRegisterInfo &MRI = MF.getRegInfo();
1735   Register RetAddrReg = TRI->getReturnAddressReg(MF);
1736   if (!MFI->isEntryFunction() &&
1737       (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1738     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1739     SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1740   }
1741 }
1742 
1743 static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
1744                                        const GCNSubtarget &ST,
1745                                        std::vector<CalleeSavedInfo> &CSI,
1746                                        unsigned &MinCSFrameIndex,
1747                                        unsigned &MaxCSFrameIndex) {
1748   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1749   MachineFrameInfo &MFI = MF.getFrameInfo();
1750   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1751 
1752   assert(
1753       llvm::is_sorted(CSI,
1754                       [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
1755                         return A.getReg() < B.getReg();
1756                       }) &&
1757       "Callee saved registers not sorted");
1758 
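       // Only 32-bit VGPR saves that actually go to memory can be merged into
       // a block operation; WWM registers are handled separately in the
       // prologue, so they are excluded here as well.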
1759   auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
1760     return !CSI.isSpilledToReg() &&
1761            TRI->getPhysRegBaseClass(CSI.getReg()) == &AMDGPU::VGPR_32RegClass &&
1762            !FuncInfo->isWWMReservedRegister(CSI.getReg());
1763   };
1764 
1765   auto CSEnd = CSI.end();
1766   for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
1767     Register Reg = CSIt->getReg();
1768     if (!CanUseBlockOps(*CSIt))
1769       continue;
1770 
1771     // Find all the regs that will fit in a 32-bit mask starting at the current
1772     // reg and build said mask. It should have 1 for every register that's
1773     // included, with the current register as the least significant bit.
1774     uint32_t Mask = 1;
1775     CSEnd = std::remove_if(
1776         CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
1777           if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
1778             Mask |= 1 << (CSI.getReg() - Reg);
1779             return true;
1780           } else {
1781             return false;
1782           }
1783         });
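         // For example, saving v40, v41 and v43 with Reg = v40 produces
         // Mask = 0b1011.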
1784 
1785     const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
1786     Register RegBlock =
1787         TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
1788     if (!RegBlock) {
1789       // We couldn't find a super register for the block. This can happen if
1790       // the register we started with is too high (e.g. v232 if the maximum is
1791       // v255). We therefore try to get the last register block and figure out
1792       // the mask from there.
1793       Register LastBlockStart =
1794           AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
1795       RegBlock =
1796           TRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
1797       assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
1798              "Couldn't find super register");
1799       int RegDelta = Reg - LastBlockStart;
1800       assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
1801              "Bad shift amount");
1802       Mask <<= RegDelta;
1803     }
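           // For example, with Reg = v232 and a maximum of v255, LastBlockStart
           // is v224 and RegDelta is 8, so the mask is shifted left by 8.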
1804 
1805     FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);
1806 
1807     // The stack objects can be a bit smaller than the register block if we
1808     // know some of the high bits of Mask are 0. This may happen often with
1809     // calling conventions where the caller- and callee-saved VGPRs are
1810     // interleaved at a small boundary (e.g. 8 or 16).
1811     int UnusedBits = llvm::countl_zero(Mask);
1812     unsigned BlockSize = TRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
1813     int FrameIdx =
1814         MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
1815                               /*isSpillSlot=*/true);
1816     if ((unsigned)FrameIdx < MinCSFrameIndex)
1817       MinCSFrameIndex = FrameIdx;
1818     if ((unsigned)FrameIdx > MaxCSFrameIndex)
1819       MaxCSFrameIndex = FrameIdx;
1820 
1821     CSIt->setFrameIdx(FrameIdx);
1822     CSIt->setReg(RegBlock);
1823   }
1824   CSI.erase(CSEnd, CSI.end());
1825 }
1826 
1827 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1828     MachineFunction &MF, const TargetRegisterInfo *TRI,
1829     std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
1830     unsigned &MaxCSFrameIndex) const {
1831   if (CSI.empty())
1832     return true; // Early exit if no callee saved registers are modified!
1833 
1834   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1835   bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
1836 
1837   if (UseVGPRBlocks)
1838     assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);
1839 
1840   return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
1841 }
1842 
1843 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1844     MachineFunction &MF, const TargetRegisterInfo *TRI,
1845     std::vector<CalleeSavedInfo> &CSI) const {
1846   if (CSI.empty())
1847     return true; // Early exit if no callee saved registers are modified!
1848 
1849   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1850   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1851   const SIRegisterInfo *RI = ST.getRegisterInfo();
1852   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1853   Register BasePtrReg = RI->getBaseRegister();
1854   Register SGPRForFPSaveRestoreCopy =
1855       FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1856   Register SGPRForBPSaveRestoreCopy =
1857       FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1858   if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1859     return false;
1860 
1861   unsigned NumModifiedRegs = 0;
1862 
1863   if (SGPRForFPSaveRestoreCopy)
1864     NumModifiedRegs++;
1865   if (SGPRForBPSaveRestoreCopy)
1866     NumModifiedRegs++;
1867 
1868   for (auto &CS : CSI) {
1869     if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
1870       CS.setDstReg(SGPRForFPSaveRestoreCopy);
1871       if (--NumModifiedRegs)
1872         break;
1873     } else if (CS.getReg() == BasePtrReg.asMCReg() &&
1874                SGPRForBPSaveRestoreCopy) {
1875       CS.setDstReg(SGPRForBPSaveRestoreCopy);
1876       if (--NumModifiedRegs)
1877         break;
1878     }
1879   }
1880 
1881   return false;
1882 }
1883 
1884 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1885   const MachineFunction &MF) const {
1886 
1887   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1888   const MachineFrameInfo &MFI = MF.getFrameInfo();
1889   const SIInstrInfo *TII = ST.getInstrInfo();
1890   uint64_t EstStackSize = MFI.estimateStackSize(MF);
1891   uint64_t MaxOffset = EstStackSize - 1;
1892 
1893   // We need the emergency stack slots to be allocated in range of the
1894   // MUBUF/flat scratch immediate offset from the base register, so assign these
1895   // first at the incoming SP position.
1896   //
1897   // TODO: We could try sorting the objects to find a hole in the first bytes
1898   // rather than allocating as close as possible. This could save a lot of
1899   // space on frames with alignment requirements.
1900   if (ST.enableFlatScratch()) {
1901     if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1902                                SIInstrFlags::FlatScratch))
1903       return false;
1904   } else {
1905     if (TII->isLegalMUBUFImmOffset(MaxOffset))
1906       return false;
1907   }
1908 
1909   return true;
1910 }
1911 
1912 bool SIFrameLowering::spillCalleeSavedRegisters(
1913     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1914     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1915   MachineFunction *MF = MBB.getParent();
1916   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1917   if (!ST.useVGPRBlockOpsForCSR())
1918     return false;
1919 
1920   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1921   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1922   const SIInstrInfo *TII = ST.getInstrInfo();
1923   SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1924 
1925   const TargetRegisterClass *BlockRegClass =
1926       static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(*MF);
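       // CSRs that assignSlotsUsingVGPRBlocks grouped into a register block
       // are saved with a single block store; the mask encodes which registers
       // of the block are actually live. Anything else falls back to the
       // generic per-register spill.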
1927   for (const CalleeSavedInfo &CS : CSI) {
1928     Register Reg = CS.getReg();
1929     if (!BlockRegClass->contains(Reg) ||
1930         !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
1931       spillCalleeSavedRegister(MBB, MI, CS, TII, TRI);
1932       continue;
1933     }
1934 
1935     // Build a scratch block store.
1936     uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
1937     int FrameIndex = CS.getFrameIdx();
1938     MachinePointerInfo PtrInfo =
1939         MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1940     MachineMemOperand *MMO =
1941         MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
1942                                  FrameInfo.getObjectSize(FrameIndex),
1943                                  FrameInfo.getObjectAlign(FrameIndex));
1944 
1945     BuildMI(MBB, MI, MI->getDebugLoc(),
1946             TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
1947         .addReg(Reg, getKillRegState(false))
1948         .addFrameIndex(FrameIndex)
1949         .addReg(MFI->getStackPtrOffsetReg())
1950         .addImm(0)
1951         .addImm(Mask)
1952         .addMemOperand(MMO);
1953 
1954     FuncInfo->setHasSpilledVGPRs();
1955 
1956     // Add the register to the liveins. This is necessary because if any of the
1957     // VGPRs in the register block is reserved (e.g. if it's a WWM register),
1958     // then the whole block will be marked as reserved and `updateLiveness` will
1959     // skip it.
1960     MBB.addLiveIn(Reg);
1961   }
1962   MBB.sortUniqueLiveIns();
1963 
1964   return true;
1965 }
1966 
1967 bool SIFrameLowering::restoreCalleeSavedRegisters(
1968     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1969     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1970   MachineFunction *MF = MBB.getParent();
1971   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1972   if (!ST.useVGPRBlockOpsForCSR())
1973     return false;
1974 
1975   SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1976   MachineFrameInfo &MFI = MF->getFrameInfo();
1977   const SIInstrInfo *TII = ST.getInstrInfo();
1978   const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
1979   const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(*MF);
1980   for (const CalleeSavedInfo &CS : reverse(CSI)) {
1981     Register Reg = CS.getReg();
1982     if (!BlockRegClass->contains(Reg) ||
1983         !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
1984       restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
1985       continue;
1986     }
1987 
1988     // Build a scratch block load.
1989     uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
1990     int FrameIndex = CS.getFrameIdx();
1991     MachinePointerInfo PtrInfo =
1992         MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1993     MachineMemOperand *MMO = MF->getMachineMemOperand(
1994         PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
1995         MFI.getObjectAlign(FrameIndex));
1996 
1997     auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
1998                        TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
1999                    .addFrameIndex(FrameIndex)
2000                    .addReg(FuncInfo->getStackPtrOffsetReg())
2001                    .addImm(0)
2002                    .addImm(Mask)
2003                    .addMemOperand(MMO);
2004     SITRI->addImplicitUsesForBlockCSRLoad(MIB, Reg);
2005 
2006     // Add the register to the liveins. This is necessary because if any of the
2007     // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2008     // then the whole block will be marked as reserved and `updateLiveness` will
2009     // skip it.
2010     MBB.addLiveIn(Reg);
2011   }
2012 
2013   MBB.sortUniqueLiveIns();
2014   return true;
2015 }
2016 
2017 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
2018   MachineFunction &MF,
2019   MachineBasicBlock &MBB,
2020   MachineBasicBlock::iterator I) const {
2021   int64_t Amount = I->getOperand(0).getImm();
2022   if (Amount == 0)
2023     return MBB.erase(I);
2024 
2025   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2026   const SIInstrInfo *TII = ST.getInstrInfo();
2027   const DebugLoc &DL = I->getDebugLoc();
2028   unsigned Opc = I->getOpcode();
2029   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
2030   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
2031 
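       // Without a reserved call frame, the call frame pseudos become real SP
       // adjustments: align the byte amount, scale it to SP's scratch units,
       // and negate it on the destroy side.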
2032   if (!hasReservedCallFrame(MF)) {
2033     Amount = alignTo(Amount, getStackAlign());
2034     assert(isUInt<32>(Amount) && "exceeded stack address space size");
2035     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2036     Register SPReg = MFI->getStackPtrOffsetReg();
2037 
2038     Amount *= getScratchScaleFactor(ST);
2039     if (IsDestroy)
2040       Amount = -Amount;
2041     auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
2042         .addReg(SPReg)
2043         .addImm(Amount);
2044     Add->getOperand(3).setIsDead(); // Mark SCC as dead.
2045   } else if (CalleePopAmount != 0) {
2046     llvm_unreachable("is this used?");
2047   }
2048 
2049   return MBB.erase(I);
2050 }
2051 
2052 /// Returns true if the frame will require a reference to the stack pointer.
2053 ///
2054 /// This is the set of conditions common to setting up the stack pointer in a
2055 /// kernel, and for using a frame pointer in a callable function.
2056 ///
2057 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
2058 /// references SP.
2059 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
2060   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
2061 }
2062 
2063 // The FP for kernels is always known to be 0, so we never really need to set
2064 // up an explicit register for it. However, DisableFramePointerElim will force
2065 // us to use a register for it.
2066 bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
2067   const MachineFrameInfo &MFI = MF.getFrameInfo();
2068 
2069   // For entry & chain functions we can use an immediate offset in most cases,
2070   // so the presence of calls doesn't imply we need a distinct frame pointer.
2071   if (MFI.hasCalls() &&
2072       !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
2073       !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
2074     // All offsets are unsigned, so need to be addressed in the same direction
2075     // as stack growth.
2076 
2077     // FIXME: This function is pretty broken, since it can be called before the
2078     // frame layout is determined or CSR spills are inserted.
2079     return MFI.getStackSize() != 0;
2080   }
2081 
2082   return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
2083          MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
2084              MF) ||
2085          mayReserveScratchForCWSR(MF) ||
2086          MF.getTarget().Options.DisableFramePointerElim(MF);
2087 }
2088 
2089 bool SIFrameLowering::mayReserveScratchForCWSR(
2090     const MachineFunction &MF) const {
2091   return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
2092          AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
2093          AMDGPU::isCompute(MF.getFunction().getCallingConv());
2094 }
2095 
2096 // This is essentially a reduced version of hasFP for entry functions. Since the
2097 // stack pointer is known 0 on entry to kernels, we never really need an FP
2098 // register. We may need to initialize the stack pointer depending on the frame
2099 // properties, which logically overlaps many of the cases where an ordinary
2100 // function would require an FP.
2101 // Also used for chain functions. While not technically entry functions, chain
2102 // functions may need to set up a stack pointer in some situations.
2103 bool SIFrameLowering::requiresStackPointerReference(
2104     const MachineFunction &MF) const {
2105   // Callable functions always require a stack pointer reference.
2106   assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
2107           MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
2108          "only expected to call this for entry points and chain functions");
2109 
2110   const MachineFrameInfo &MFI = MF.getFrameInfo();
2111 
2112   // Entry points ordinarily don't need to initialize SP. We have to set it up
2113   // for callees if there are any. Also note tail calls are impossible/don't
2114   // make any sense for kernels.
2115   if (MFI.hasCalls())
2116     return true;
2117 
2118   // We still need to initialize the SP if we're doing anything weird that
2119   // references the SP, like variable sized stack objects.
2120   return frameTriviallyRequiresSP(MFI);
2121 }
2122