1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPURegisterBankInfo.h"
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUInstPrinter.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/LiveRegUnits.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
26 
27 using namespace llvm;
28 
29 #define GET_REGINFO_TARGET_DESC
30 #include "AMDGPUGenRegisterInfo.inc"
31 
32 static cl::opt<bool> EnableSpillSGPRToVGPR(
33   "amdgpu-spill-sgpr-to-vgpr",
34   cl::desc("Enable spilling SGPRs to VGPRs"),
35   cl::ReallyHidden,
36   cl::init(true));
37 
38 std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
39 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40 
41 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42 // Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44 //      meaning index 7 in SubRegFromChannelTable.
45 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46     0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
47 
48 static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49                                  const Twine &ErrMsg) {
50   Fn.getContext().diagnose(
51       DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52 }
53 
54 namespace llvm {
55 
56 // A temporary struct to spill SGPRs.
57 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58 // just v_writelane and v_readlane.
59 //
60 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61 // is saved to scratch (or the other way around for loads).
62 // For this, a VGPR is required where the needed lanes can be clobbered. The
63 // RegScavenger can provide a VGPR where currently active lanes can be
64 // clobbered, but we still need to save inactive lanes.
65 // The high-level steps are:
66 // - Try to scavenge SGPR(s) to save exec
67 // - Try to scavenge VGPR
68 // - Save needed, all or inactive lanes of a TmpVGPR
69 // - Spill/Restore SGPRs using TmpVGPR
70 // - Restore TmpVGPR
71 //
72 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73 // cannot scavenge temporary SGPRs to save exec, we use the following code:
74 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75 // s_not exec, exec
76 // buffer_store_dword TmpVGPR ; save inactive lanes
77 // s_not exec, exec
78 struct SGPRSpillBuilder {
79   struct PerVGPRData {
80     unsigned PerVGPR;
81     unsigned NumVGPRs;
82     int64_t VGPRLanes;
83   };
84 
85   // The SGPR to save
86   Register SuperReg;
87   MachineBasicBlock::iterator MI;
88   ArrayRef<int16_t> SplitParts;
89   unsigned NumSubRegs;
90   bool IsKill;
91   const DebugLoc &DL;
92 
93   /* When spilling to stack */
94   // The SGPRs are written into this VGPR, which is then written to scratch
95   // (or vice versa for loads).
96   Register TmpVGPR = AMDGPU::NoRegister;
97   // Temporary spill slot to save TmpVGPR to.
98   int TmpVGPRIndex = 0;
99   // If TmpVGPR is live before the spill or if it is scavenged.
100   bool TmpVGPRLive = false;
101   // Scavenged SGPR to save EXEC.
102   Register SavedExecReg = AMDGPU::NoRegister;
103   // Stack index to write the SGPRs to.
104   int Index;
105   unsigned EltSize = 4;
106 
107   RegScavenger *RS;
108   MachineBasicBlock *MBB;
109   MachineFunction &MF;
110   SIMachineFunctionInfo &MFI;
111   const SIInstrInfo &TII;
112   const SIRegisterInfo &TRI;
113   bool IsWave32;
114   Register ExecReg;
115   unsigned MovOpc;
116   unsigned NotOpc;
117 
118   SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119                    bool IsWave32, MachineBasicBlock::iterator MI, int Index,
120                    RegScavenger *RS)
121       : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122                          MI->getOperand(0).isKill(), Index, RS) {}
123 
124   SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
125                    bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
126                    bool IsKill, int Index, RegScavenger *RS)
127       : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128         Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129         MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
130         IsWave32(IsWave32) {
131     const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132     SplitParts = TRI.getRegSplitParts(RC, EltSize);
133     NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134 
135     if (IsWave32) {
136       ExecReg = AMDGPU::EXEC_LO;
137       MovOpc = AMDGPU::S_MOV_B32;
138       NotOpc = AMDGPU::S_NOT_B32;
139     } else {
140       ExecReg = AMDGPU::EXEC;
141       MovOpc = AMDGPU::S_MOV_B64;
142       NotOpc = AMDGPU::S_NOT_B64;
143     }
144 
145     assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146     assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147            SuperReg != AMDGPU::EXEC && "exec should never spill");
148   }
149 
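  // Describes how the spilled SGPR maps onto VGPR lanes: PerVGPR is the number
  // of lanes per VGPR (the wave size), NumVGPRs is how many VGPRs are needed to
  // hold all NumSubRegs 32-bit parts, and VGPRLanes is the mask of lanes used
  // within each VGPR, e.g. an SGPR_64 spill in wave64 gives PerVGPR = 64,
  // NumVGPRs = 1, VGPRLanes = 0b11.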
150   PerVGPRData getPerVGPRData() {
151     PerVGPRData Data;
152     Data.PerVGPR = IsWave32 ? 32 : 64;
153     Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154     Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155     return Data;
156   }
157 
158   // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159   // free.
160   // Writes these instructions if an SGPR can be scavenged:
161   // s_mov_b64 s[6:7], exec   ; Save exec
162   // s_mov_b64 exec, 3        ; Wanted lanemask
163   // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
164   //
165   // Writes these instructions if no SGPR can be scavenged:
166   // buffer_store_dword v0    ; Only if no free VGPR was found
167   // s_not_b64 exec, exec
168   // buffer_store_dword v0    ; Save inactive lanes
169   //                          ; exec stays inverted, it is flipped back in
170   //                          ; restore.
171   void prepare() {
172     // Scavenged temporary VGPR to use. It must be scavenged once for any number
173     // of spilled subregs.
174     // FIXME: The liveness analysis is limited and does not tell if a register
175     // is in use in lanes that are currently inactive. We can never be sure if
176     // a register is actually in use in another lane, so we need to save all
177     // used lanes of the chosen VGPR.
178     assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179     TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180                                             0, false);
181 
182     // Reserve temporary stack slot
183     TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184     if (TmpVGPR) {
185       // Found a register that is dead in the currently active lanes, we only
186       // need to spill inactive lanes.
187       TmpVGPRLive = false;
188     } else {
189       // Pick v0 because it doesn't make a difference.
190       TmpVGPR = AMDGPU::VGPR0;
191       TmpVGPRLive = true;
192     }
193 
194     if (TmpVGPRLive) {
195       // We need to inform the scavenger that this index is already in use until
196       // we're done with the custom emergency spill.
197       RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198     }
199 
200     // We may end up recursively calling the scavenger, and don't want to re-use
201     // the same register.
202     RS->setRegUsed(TmpVGPR);
203 
204     // Try to scavenge SGPRs to save exec
205     assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206     const TargetRegisterClass &RC =
207         IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208     RS->setRegUsed(SuperReg);
209     SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210 
211     int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212 
213     if (SavedExecReg) {
214       RS->setRegUsed(SavedExecReg);
215       // Set exec to needed lanes
216       BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
217       auto I =
218           BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219       if (!TmpVGPRLive)
220         I.addReg(TmpVGPR, RegState::ImplicitDefine);
221       // Spill needed lanes
222       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223     } else {
224       // The modify and restore of exec clobber SCC, which we would have to save
225       // and restore. FIXME: We probably would need to reserve a register for
226       // this.
227       if (RS->isRegUsed(AMDGPU::SCC))
228         emitUnsupportedError(MF.getFunction(), *MI,
229                              "unhandled SGPR spill to memory");
230 
231       // Spill active lanes
232       if (TmpVGPRLive)
233         TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234                                     /*IsKill*/ false);
235       // Spill inactive lanes
236       auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237       if (!TmpVGPRLive)
238         I.addReg(TmpVGPR, RegState::ImplicitDefine);
239       I->getOperand(2).setIsDead(); // Mark SCC as dead.
240       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241     }
242   }
243 
244   // Writes these instructions if an SGPR can be scavenged:
245   // buffer_load_dword v1     ; Reload scavenged VGPR from emergency slot
246   // s_waitcnt vmcnt(0)       ; If a free VGPR was found
247   // s_mov_b64 exec, s[6:7]   ; Restore exec
248   //
249   // Writes these instructions if no SGPR can be scavenged:
250   // buffer_load_dword v0     ; Restore inactive lanes
251   // s_waitcnt vmcnt(0)       ; If a free VGPR was found
252   // s_not_b64 exec, exec
253   // buffer_load_dword v0     ; Only if no free VGPR was found
254   void restore() {
255     if (SavedExecReg) {
256       // Restore used lanes
257       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258                                   /*IsKill*/ false);
259       // Restore exec
260       auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
261                    .addReg(SavedExecReg, RegState::Kill);
262       // Add an implicit use of the load so it is not dead.
263       // FIXME This inserts an unnecessary waitcnt
264       if (!TmpVGPRLive) {
265         I.addReg(TmpVGPR, RegState::ImplicitKill);
266       }
267     } else {
268       // Restore inactive lanes
269       TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270                                   /*IsKill*/ false);
271       auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272       if (!TmpVGPRLive)
273         I.addReg(TmpVGPR, RegState::ImplicitKill);
274       I->getOperand(2).setIsDead(); // Mark SCC as dead.
275 
276       // Restore active lanes
277       if (TmpVGPRLive)
278         TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279     }
280 
281     // Inform the scavenger where we're releasing our custom scavenged register.
282     if (TmpVGPRLive) {
283       MachineBasicBlock::iterator RestorePt = std::prev(MI);
284       RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285     }
286   }
287 
288   // Write TmpVGPR to memory or read TmpVGPR from memory.
289   // Either using a single buffer_load/store if exec is set to the needed mask
290   // or using
291   // buffer_load
292   // s_not exec, exec
293   // buffer_load
294   // s_not exec, exec
295   void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296     if (SavedExecReg) {
297       // Spill needed lanes
298       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299     } else {
300       // The modify and restore of exec clobber SCC, which we would have to save
301       // and restore. FIXME: We probably would need to reserve a register for
302       // this.
303       if (RS->isRegUsed(AMDGPU::SCC))
304         emitUnsupportedError(MF.getFunction(), *MI,
305                              "unhandled SGPR spill to memory");
306 
307       // Spill active lanes
308       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309                                   /*IsKill*/ false);
310       // Spill inactive lanes
311       auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312       Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313       TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314       auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315       Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316     }
317   }
318 
319   void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
320     assert(MBB->getParent() == &MF);
321     MI = NewMI;
322     MBB = NewMBB;
323   }
324 };
325 
326 } // namespace llvm
327 
328 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
329     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330                             ST.getAMDGPUDwarfFlavour(),
331                             /*PC=*/0, ST.getHwMode()),
332       ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
333 
334   assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
335          getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
336          (getSubRegIndexLaneMask(AMDGPU::lo16) |
337           getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
338            getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
339          "getNumCoveredRegs() will not work with generated subreg masks!");
340 
341   RegPressureIgnoredUnits.resize(getNumRegUnits());
342   RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
343   for (auto Reg : AMDGPU::VGPR_16RegClass) {
344     if (AMDGPU::isHi16Reg(Reg, *this))
345       RegPressureIgnoredUnits.set(*regunits(Reg).begin());
346   }
347 
348   // HACK: Until this is fully tablegen'd.
349   static llvm::once_flag InitializeRegSplitPartsFlag;
350 
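  // Populate RegSplitParts: RegSplitParts[N-1][P] holds the sub-register index
  // that covers an N*16-bit slice starting at bit offset P*N*16 of a wider
  // register (consumed by getRegSplitParts).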
351   static auto InitializeRegSplitPartsOnce = [this]() {
352     for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
353       unsigned Size = getSubRegIdxSize(Idx);
354       if (Size & 15)
355         continue;
356       std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
357       unsigned Pos = getSubRegIdxOffset(Idx);
358       if (Pos % Size)
359         continue;
360       Pos /= Size;
361       if (Vec.empty()) {
362         unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
363         Vec.resize(MaxNumParts);
364       }
365       Vec[Pos] = Idx;
366     }
367   };
368 
369   static llvm::once_flag InitializeSubRegFromChannelTableFlag;
370 
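  // Populate SubRegFromChannelTable: rows are selected by the mapped width (via
  // SubRegFromChannelTableWidthMap) and columns by the starting 32-bit channel,
  // giving back the sub-register index for getSubRegFromChannel.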
371   static auto InitializeSubRegFromChannelTableOnce = [this]() {
372     for (auto &Row : SubRegFromChannelTable)
373       Row.fill(AMDGPU::NoSubRegister);
374     for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
375       unsigned Width = getSubRegIdxSize(Idx) / 32;
376       unsigned Offset = getSubRegIdxOffset(Idx) / 32;
377       assert(Width < SubRegFromChannelTableWidthMap.size());
378       Width = SubRegFromChannelTableWidthMap[Width];
379       if (Width == 0)
380         continue;
381       unsigned TableIdx = Width - 1;
382       assert(TableIdx < SubRegFromChannelTable.size());
383       assert(Offset < SubRegFromChannelTable[TableIdx].size());
384       SubRegFromChannelTable[TableIdx][Offset] = Idx;
385     }
386   };
387 
388   llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
389   llvm::call_once(InitializeSubRegFromChannelTableFlag,
390                   InitializeSubRegFromChannelTableOnce);
391 }
392 
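// Mark Reg and every register aliasing it (all of its sub- and super-registers)
// as reserved.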
393 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
394                                            MCRegister Reg) const {
395   for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
396     Reserved.set(*R);
397 }
398 
399 // Forced to be here by one .inc
400 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
401   const MachineFunction *MF) const {
402   CallingConv::ID CC = MF->getFunction().getCallingConv();
403   switch (CC) {
404   case CallingConv::C:
405   case CallingConv::Fast:
406   case CallingConv::Cold:
407     return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
408                                : CSR_AMDGPU_SaveList;
409   case CallingConv::AMDGPU_Gfx:
410     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
411                                : CSR_AMDGPU_SI_Gfx_SaveList;
412   case CallingConv::AMDGPU_CS_ChainPreserve:
413     return CSR_AMDGPU_CS_ChainPreserve_SaveList;
414   default: {
415     // Dummy to not crash RegisterClassInfo.
416     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
417     return &NoCalleeSavedReg;
418   }
419   }
420 }
421 
422 const MCPhysReg *
423 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
424   return nullptr;
425 }
426 
427 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
428                                                      CallingConv::ID CC) const {
429   switch (CC) {
430   case CallingConv::C:
431   case CallingConv::Fast:
432   case CallingConv::Cold:
433     return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
434                                : CSR_AMDGPU_RegMask;
435   case CallingConv::AMDGPU_Gfx:
436     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
437                                : CSR_AMDGPU_SI_Gfx_RegMask;
438   case CallingConv::AMDGPU_CS_Chain:
439   case CallingConv::AMDGPU_CS_ChainPreserve:
440     // Calls to these functions never return, so we can pretend everything is
441     // preserved.
442     return AMDGPU_AllVGPRs_RegMask;
443   default:
444     return nullptr;
445   }
446 }
447 
448 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
449   return CSR_AMDGPU_NoRegs_RegMask;
450 }
451 
452 bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
453   return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
454 }
455 
456 const TargetRegisterClass *
457 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
458                                           const MachineFunction &MF) const {
459   // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
460   // equivalent AV class. If one were used here, the verifier would crash after
461   // RegBankSelect in the GISel flow, because the aligned regclasses are not
462   // fully assigned until instruction selection.
463   if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
464     if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
465       return &AMDGPU::AV_32RegClass;
466     if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
467       return &AMDGPU::AV_64RegClass;
468     if (RC == &AMDGPU::VReg_64_Align2RegClass ||
469         RC == &AMDGPU::AReg_64_Align2RegClass)
470       return &AMDGPU::AV_64_Align2RegClass;
471     if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
472       return &AMDGPU::AV_96RegClass;
473     if (RC == &AMDGPU::VReg_96_Align2RegClass ||
474         RC == &AMDGPU::AReg_96_Align2RegClass)
475       return &AMDGPU::AV_96_Align2RegClass;
476     if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
477       return &AMDGPU::AV_128RegClass;
478     if (RC == &AMDGPU::VReg_128_Align2RegClass ||
479         RC == &AMDGPU::AReg_128_Align2RegClass)
480       return &AMDGPU::AV_128_Align2RegClass;
481     if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
482       return &AMDGPU::AV_160RegClass;
483     if (RC == &AMDGPU::VReg_160_Align2RegClass ||
484         RC == &AMDGPU::AReg_160_Align2RegClass)
485       return &AMDGPU::AV_160_Align2RegClass;
486     if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
487       return &AMDGPU::AV_192RegClass;
488     if (RC == &AMDGPU::VReg_192_Align2RegClass ||
489         RC == &AMDGPU::AReg_192_Align2RegClass)
490       return &AMDGPU::AV_192_Align2RegClass;
491     if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
492       return &AMDGPU::AV_256RegClass;
493     if (RC == &AMDGPU::VReg_256_Align2RegClass ||
494         RC == &AMDGPU::AReg_256_Align2RegClass)
495       return &AMDGPU::AV_256_Align2RegClass;
496     if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
497       return &AMDGPU::AV_512RegClass;
498     if (RC == &AMDGPU::VReg_512_Align2RegClass ||
499         RC == &AMDGPU::AReg_512_Align2RegClass)
500       return &AMDGPU::AV_512_Align2RegClass;
501     if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
502       return &AMDGPU::AV_1024RegClass;
503     if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
504         RC == &AMDGPU::AReg_1024_Align2RegClass)
505       return &AMDGPU::AV_1024_Align2RegClass;
506   }
507 
508   return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
509 }
510 
511 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
512   const SIFrameLowering *TFI = ST.getFrameLowering();
513   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
514 
515   // During ISel lowering we always reserve the stack pointer in entry and chain
516   // functions, but never actually want to reference it when accessing our own
517   // frame. If we need a frame pointer we use it, but otherwise we can just use
518   // an immediate "0" which we represent by returning NoRegister.
519   if (FuncInfo->isBottomOfStack()) {
520     return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
521   }
522   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
523                         : FuncInfo->getStackPtrOffsetReg();
524 }
525 
526 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
527   // When we need stack realignment, we can't reference off of the
528   // stack pointer, so we reserve a base pointer.
529   return shouldRealignStack(MF);
530 }
531 
532 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
533 
534 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
535   return AMDGPU_AllVGPRs_RegMask;
536 }
537 
538 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
539   return AMDGPU_AllAGPRs_RegMask;
540 }
541 
542 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
543   return AMDGPU_AllVectorRegs_RegMask;
544 }
545 
546 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
547   return AMDGPU_AllAllocatableSRegs_RegMask;
548 }
549 
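// Look up the sub-register index covering NumRegs 32-bit channels starting at
// channel Channel, e.g. getSubRegFromChannel(2, 2) yields the index for
// channels 2 and 3 (sub2_sub3).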
550 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
551                                               unsigned NumRegs) {
552   assert(NumRegs < SubRegFromChannelTableWidthMap.size());
553   unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
554   assert(NumRegIndex && "Not implemented");
555   assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
556   return SubRegFromChannelTable[NumRegIndex - 1][Channel];
557 }
558 
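// Return an Align-aligned register of class RC placed at the top of the
// function's SGPR budget (used e.g. for the reserved private segment buffer).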
559 MCRegister
560 SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
561                                         const unsigned Align,
562                                         const TargetRegisterClass *RC) const {
563   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
564   MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
565   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
566 }
567 
568 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
569   const MachineFunction &MF) const {
570   return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
571 }
572 
573 std::pair<unsigned, unsigned>
574 SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
575   const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
576 
577   unsigned MaxNumVGPRs = MaxVectorRegs;
578   unsigned MaxNumAGPRs = 0;
579 
580   // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
581   // a wave may have up to 512 total vector registers combining together both
582   // VGPRs and AGPRs. Hence, in an entry function without calls and without
583   // AGPRs used within it, it is possible to use the whole vector register
584   // budget for VGPRs.
585   //
586   // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
587   //       register file accordingly.
588   if (ST.hasGFX90AInsts()) {
589     unsigned MinNumAGPRs = 0;
590     const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
591     const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
592 
593     const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
594 
595     // TODO: Move this logic into subtarget on IR function
596     //
597     // TODO: The lower bound should probably force the number of required
598     // registers up, overriding amdgpu-waves-per-eu.
599     std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
600         MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
601         /*OnlyFirstRequired=*/true);
602 
603     if (MinNumAGPRs == DefaultNumAGPR.first) {
604       // Default to splitting half the registers if AGPRs are required.
605       MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
606     } else {
607       // Align to accum_offset's allocation granularity.
608       MinNumAGPRs = alignTo(MinNumAGPRs, 4);
609 
610       MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
611     }
612 
613     // Clamp values to be inbounds of our limits, and ensure min <= max.
614 
615     MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
616     MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
617 
618     MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
619     MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
620 
621     assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
622            MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
623            "invalid register counts");
624   } else if (ST.hasMAIInsts()) {
625     // On gfx908 the number of AGPRs always equals the number of VGPRs.
626     MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
627   }
628 
629   return std::pair(MaxNumVGPRs, MaxNumAGPRs);
630 }
631 
632 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
633   BitVector Reserved(getNumRegs());
634   Reserved.set(AMDGPU::MODE);
635 
636   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
637 
638   // Reserve special purpose registers.
639   //
640   // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
641   // this seems likely to result in bugs, so I'm marking them as reserved.
642   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
643   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
644 
645   // M0 has to be reserved so that llvm accepts it as a live-in into a block.
646   reserveRegisterTuples(Reserved, AMDGPU::M0);
647 
648   // Reserve src_vccz, src_execz, src_scc.
649   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
650   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
651   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
652 
653   // Reserve the memory aperture registers
654   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
655   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
656   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
657   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
658 
659   // Reserve async counters pseudo registers
660   reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
661   reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
662 
663   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
664   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
665 
666   // Reserve xnack_mask registers - support is not implemented in Codegen.
667   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
668 
669   // Reserve lds_direct register - support is not implemented in Codegen.
670   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
671 
672   // Reserve Trap Handler registers - support is not implemented in Codegen.
673   reserveRegisterTuples(Reserved, AMDGPU::TBA);
674   reserveRegisterTuples(Reserved, AMDGPU::TMA);
675   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
676   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
677   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
678   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
679   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
680   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
681   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
682   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
683 
684   // Reserve null register - it shall never be allocated
685   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
686 
687   // Reserve SGPRs.
688   //
689   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
690   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
691   for (const TargetRegisterClass *RC : regclasses()) {
692     if (RC->isBaseClass() && isSGPRClass(RC)) {
693       unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
694       for (MCPhysReg Reg : *RC) {
695         unsigned Index = getHWRegIndex(Reg);
696         if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
697           Reserved.set(Reg);
698       }
699     }
700   }
701 
702   Register ScratchRSrcReg = MFI->getScratchRSrcReg();
703   if (ScratchRSrcReg != AMDGPU::NoRegister) {
704     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
705     // need to spill.
706     // TODO: May need to reserve a VGPR if doing LDS spilling.
707     reserveRegisterTuples(Reserved, ScratchRSrcReg);
708   }
709 
710   Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
711   if (LongBranchReservedReg)
712     reserveRegisterTuples(Reserved, LongBranchReservedReg);
713 
714   // We have to assume the SP is needed in case there are calls in the function,
715   // which is detected after the function is lowered. If we aren't really going
716   // to need SP, don't bother reserving it.
717   MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
718   if (StackPtrReg) {
719     reserveRegisterTuples(Reserved, StackPtrReg);
720     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
721   }
722 
723   MCRegister FrameReg = MFI->getFrameOffsetReg();
724   if (FrameReg) {
725     reserveRegisterTuples(Reserved, FrameReg);
726     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
727   }
728 
729   if (hasBasePointer(MF)) {
730     MCRegister BasePtrReg = getBaseRegister();
731     reserveRegisterTuples(Reserved, BasePtrReg);
732     assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
733   }
734 
735   // FIXME: Use same reserved register introduced in D149775
736   // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
737   Register ExecCopyReg = MFI->getSGPRForEXECCopy();
738   if (ExecCopyReg)
739     reserveRegisterTuples(Reserved, ExecCopyReg);
740 
741   // Reserve VGPRs/AGPRs.
742   //
743   auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
744 
745   for (const TargetRegisterClass *RC : regclasses()) {
746     if (RC->isBaseClass() && isVGPRClass(RC)) {
747       unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
748       for (MCPhysReg Reg : *RC) {
749         unsigned Index = getHWRegIndex(Reg);
750         if (Index + NumRegs > MaxNumVGPRs)
751           Reserved.set(Reg);
752       }
753     }
754   }
755 
756   // Reserve all the AGPRs if there are no instructions to use them.
757   if (!ST.hasMAIInsts())
758     MaxNumAGPRs = 0;
759   for (const TargetRegisterClass *RC : regclasses()) {
760     if (RC->isBaseClass() && isAGPRClass(RC)) {
761       unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
762       for (MCPhysReg Reg : *RC) {
763         unsigned Index = getHWRegIndex(Reg);
764         if (Index + NumRegs > MaxNumAGPRs)
765           Reserved.set(Reg);
766       }
767     }
768   }
769 
770   // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
771   // VGPR available at all times.
772   if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
773     reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
774   }
775 
776   // During wwm-regalloc, reserve the registers for per-lane VGPR allocation. The
777   // MFI->getNonWWMRegMask() field holds a valid bitmask only during
778   // wwm-regalloc and is empty otherwise.
779   BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
780   if (!NonWWMRegMask.empty()) {
781     for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
782          RegI < RegE; ++RegI) {
783       if (NonWWMRegMask.test(RegI))
784         reserveRegisterTuples(Reserved, RegI);
785     }
786   }
787 
788   for (Register Reg : MFI->getWWMReservedRegs())
789     reserveRegisterTuples(Reserved, Reg);
790 
791   // FIXME: Stop using reserved registers for this.
792   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
793     reserveRegisterTuples(Reserved, Reg);
794 
795   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
796     reserveRegisterTuples(Reserved, Reg);
797 
798   return Reserved;
799 }
800 
801 bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
802                                       MCRegister PhysReg) const {
803   return !MF.getRegInfo().isReserved(PhysReg);
804 }
805 
806 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
807   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
808   // On entry or in chain functions, the base address is 0, so it can't possibly
809   // need any more alignment.
810 
811   // FIXME: Should be able to specify the entry frame alignment per calling
812   // convention instead.
813   if (Info->isBottomOfStack())
814     return false;
815 
816   return TargetRegisterInfo::shouldRealignStack(MF);
817 }
818 
819 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
820   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
821   if (Info->isEntryFunction()) {
822     const MachineFrameInfo &MFI = Fn.getFrameInfo();
823     return MFI.hasStackObjects() || MFI.hasCalls();
824   }
825 
826   // May need scavenger for dealing with callee saved registers.
827   return true;
828 }
829 
830 bool SIRegisterInfo::requiresFrameIndexScavenging(
831   const MachineFunction &MF) const {
832   // Do not use frame virtual registers. They used to be used for SGPRs, but
833   // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
834   // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
835   // spill.
836   return false;
837 }
838 
839 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
840   const MachineFunction &MF) const {
841   const MachineFrameInfo &MFI = MF.getFrameInfo();
842   return MFI.hasStackObjects();
843 }
844 
845 bool SIRegisterInfo::requiresVirtualBaseRegisters(
846   const MachineFunction &) const {
847   // There are no special dedicated stack or frame pointers.
848   return true;
849 }
850 
851 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
852   assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
853 
854   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
855                                           AMDGPU::OpName::offset);
856   return MI->getOperand(OffIdx).getImm();
857 }
858 
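// Return the constant offset already folded into MI alongside the frame index
// operand at Idx: the immediate source for the V_ADD forms, or the scratch
// offset field for MUBUF / flat-scratch accesses (0 otherwise).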
859 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
860                                                  int Idx) const {
861   switch (MI->getOpcode()) {
862   case AMDGPU::V_ADD_U32_e32:
863   case AMDGPU::V_ADD_U32_e64:
864   case AMDGPU::V_ADD_CO_U32_e32: {
865     int OtherIdx = Idx == 1 ? 2 : 1;
866     const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
867     return OtherOp.isImm() ? OtherOp.getImm() : 0;
868   }
869   case AMDGPU::V_ADD_CO_U32_e64: {
870     int OtherIdx = Idx == 2 ? 3 : 2;
871     const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
872     return OtherOp.isImm() ? OtherOp.getImm() : 0;
873   }
874   default:
875     break;
876   }
877 
878   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
879     return 0;
880 
881   assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
882                                             AMDGPU::OpName::vaddr) ||
883          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
884                                             AMDGPU::OpName::saddr))) &&
885          "Should never see frame index on non-address operand");
886 
887   return getScratchInstrOffset(MI);
888 }
889 
890 static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
891                               const MachineInstr &MI) {
892   assert(MI.getDesc().isAdd());
893   const MachineOperand &Src0 = MI.getOperand(1);
894   const MachineOperand &Src1 = MI.getOperand(2);
895 
896   if (Src0.isFI()) {
897     return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
898                                                        Src1.getReg()));
899   }
900 
901   if (Src1.isFI()) {
902     return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
903                                                        Src0.getReg()));
904   }
905 
906   return false;
907 }
908 
909 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
910   // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
911   switch (MI->getOpcode()) {
912   case AMDGPU::V_ADD_U32_e32: {
913     // TODO: We could handle this but it requires work to avoid violating
914     // operand restrictions.
915     if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
916         !isFIPlusImmOrVGPR(*this, *MI))
917       return false;
918     [[fallthrough]];
919   }
920   case AMDGPU::V_ADD_U32_e64:
921     // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
922     //
923     // Much of the benefit with the MUBUF handling is we avoid duplicating the
924     // shift of the frame register, which isn't needed with scratch.
925     //
926     // materializeFrameBaseRegister doesn't know the register classes of the
927     // uses, and unconditionally uses an s_add_i32, which will end up using a
928     // copy for the vector uses.
929     return !ST.enableFlatScratch();
930   case AMDGPU::V_ADD_CO_U32_e32:
931     if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
932         !isFIPlusImmOrVGPR(*this, *MI))
933       return false;
934     // We can't deal with the case where the carry out has a use (though this
935     // should never happen).
936     return MI->getOperand(3).isDead();
937   case AMDGPU::V_ADD_CO_U32_e64:
938     // TODO: Should we check use_empty instead?
939     return MI->getOperand(1).isDead();
940   default:
941     break;
942   }
943 
944   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
945     return false;
946 
947   int64_t FullOffset = Offset + getScratchInstrOffset(MI);
948 
949   const SIInstrInfo *TII = ST.getInstrInfo();
950   if (SIInstrInfo::isMUBUF(*MI))
951     return !TII->isLegalMUBUFImmOffset(FullOffset);
952 
953   return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
954                                  SIInstrFlags::FlatScratch);
955 }
956 
957 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
958                                                       int FrameIdx,
959                                                       int64_t Offset) const {
960   MachineBasicBlock::iterator Ins = MBB->begin();
961   DebugLoc DL; // Defaults to "unknown"
962 
963   if (Ins != MBB->end())
964     DL = Ins->getDebugLoc();
965 
966   MachineFunction *MF = MBB->getParent();
967   const SIInstrInfo *TII = ST.getInstrInfo();
968   MachineRegisterInfo &MRI = MF->getRegInfo();
969   unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
970                                            : AMDGPU::V_MOV_B32_e32;
971 
972   Register BaseReg = MRI.createVirtualRegister(
973       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
974                              : &AMDGPU::VGPR_32RegClass);
975 
976   if (Offset == 0) {
977     BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
978       .addFrameIndex(FrameIdx);
979     return BaseReg;
980   }
981 
982   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
983 
984   Register FIReg = MRI.createVirtualRegister(
985       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
986                              : &AMDGPU::VGPR_32RegClass);
987 
988   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
989     .addImm(Offset);
990   BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
991     .addFrameIndex(FrameIdx);
992 
993   if (ST.enableFlatScratch() ) {
994     // FIXME: Make sure scc isn't live in.
995     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
996         .addReg(OffsetReg, RegState::Kill)
997         .addReg(FIReg)
998         .setOperandDead(3); // scc
999     return BaseReg;
1000   }
1001 
1002   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
1003     .addReg(OffsetReg, RegState::Kill)
1004     .addReg(FIReg)
1005     .addImm(0); // clamp bit
1006 
1007   return BaseReg;
1008 }
1009 
1010 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
1011                                        int64_t Offset) const {
1012   const SIInstrInfo *TII = ST.getInstrInfo();
1013 
1014   switch (MI.getOpcode()) {
1015   case AMDGPU::V_ADD_U32_e32:
1016   case AMDGPU::V_ADD_CO_U32_e32: {
1017     MachineOperand *FIOp = &MI.getOperand(2);
1018     MachineOperand *ImmOp = &MI.getOperand(1);
1019     if (!FIOp->isFI())
1020       std::swap(FIOp, ImmOp);
1021 
1022     if (!ImmOp->isImm()) {
1023       assert(Offset == 0);
1024       FIOp->ChangeToRegister(BaseReg, false);
1025       TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
1026       return;
1027     }
1028 
1029     int64_t TotalOffset = ImmOp->getImm() + Offset;
1030     if (TotalOffset == 0) {
1031       MI.setDesc(TII->get(AMDGPU::COPY));
1032       for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1033         MI.removeOperand(I);
1034 
1035       MI.getOperand(1).ChangeToRegister(BaseReg, false);
1036       return;
1037     }
1038 
1039     ImmOp->setImm(TotalOffset);
1040 
1041     MachineBasicBlock *MBB = MI.getParent();
1042     MachineFunction *MF = MBB->getParent();
1043     MachineRegisterInfo &MRI = MF->getRegInfo();
1044 
1045     // FIXME: materializeFrameBaseRegister does not know the register class of
1046     // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
1047     // a copy so we have a legal operand and hope the register coalescer can
1048     // clean it up.
1049     if (isSGPRReg(MRI, BaseReg)) {
1050       Register BaseRegVGPR =
1051           MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1052       BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1053           .addReg(BaseReg);
1054       MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1055     } else {
1056       MI.getOperand(2).ChangeToRegister(BaseReg, false);
1057     }
1058     return;
1059   }
1060   case AMDGPU::V_ADD_U32_e64:
1061   case AMDGPU::V_ADD_CO_U32_e64: {
1062     int Src0Idx = MI.getNumExplicitDefs();
1063     MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1064     MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1065     if (!FIOp->isFI())
1066       std::swap(FIOp, ImmOp);
1067 
1068     if (!ImmOp->isImm()) {
1069       FIOp->ChangeToRegister(BaseReg, false);
1070       TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1071       return;
1072     }
1073 
1074     int64_t TotalOffset = ImmOp->getImm() + Offset;
1075     if (TotalOffset == 0) {
1076       MI.setDesc(TII->get(AMDGPU::COPY));
1077 
1078       for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1079         MI.removeOperand(I);
1080 
1081       MI.getOperand(1).ChangeToRegister(BaseReg, false);
1082     } else {
1083       FIOp->ChangeToRegister(BaseReg, false);
1084       ImmOp->setImm(TotalOffset);
1085     }
1086 
1087     return;
1088   }
1089   default:
1090     break;
1091   }
1092 
1093   bool IsFlat = TII->isFLATScratch(MI);
1094 
1095 #ifndef NDEBUG
1096   // FIXME: Is it possible to be storing a frame index to itself?
1097   bool SeenFI = false;
1098   for (const MachineOperand &MO: MI.operands()) {
1099     if (MO.isFI()) {
1100       if (SeenFI)
1101         llvm_unreachable("should not see multiple frame indices");
1102 
1103       SeenFI = true;
1104     }
1105   }
1106 #endif
1107 
1108   MachineOperand *FIOp =
1109       TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1110                                       : AMDGPU::OpName::vaddr);
1111 
1112   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1113   int64_t NewOffset = OffsetOp->getImm() + Offset;
1114 
1115   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1116   assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1117 
1118   if (IsFlat) {
1119     assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1120                                   SIInstrFlags::FlatScratch) &&
1121            "offset should be legal");
1122     FIOp->ChangeToRegister(BaseReg, false);
1123     OffsetOp->setImm(NewOffset);
1124     return;
1125   }
1126 
1127 #ifndef NDEBUG
1128   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1129   assert(SOffset->isImm() && SOffset->getImm() == 0);
1130 #endif
1131 
1132   assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1133 
1134   FIOp->ChangeToRegister(BaseReg, false);
1135   OffsetOp->setImm(NewOffset);
1136 }
1137 
1138 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
1139                                         Register BaseReg,
1140                                         int64_t Offset) const {
1141 
1142   switch (MI->getOpcode()) {
1143   case AMDGPU::V_ADD_U32_e32:
1144   case AMDGPU::V_ADD_CO_U32_e32:
1145     return true;
1146   case AMDGPU::V_ADD_U32_e64:
1147   case AMDGPU::V_ADD_CO_U32_e64:
1148     return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1149   default:
1150     break;
1151   }
1152 
1153   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1154     return false;
1155 
1156   int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1157 
1158   const SIInstrInfo *TII = ST.getInstrInfo();
1159   if (SIInstrInfo::isMUBUF(*MI))
1160     return TII->isLegalMUBUFImmOffset(NewOffset);
1161 
1162   return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1163                                 SIInstrFlags::FlatScratch);
1164 }
1165 
1166 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
1167   const MachineFunction &MF, unsigned Kind) const {
1168   // This is inaccurate. It depends on the instruction and address space. The
1169   // only place where we should hit this is for dealing with frame indexes /
1170   // private accesses, so this is correct in that case.
1171   return &AMDGPU::VGPR_32RegClass;
1172 }
1173 
1174 const TargetRegisterClass *
1175 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
1176   if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
1177     return getEquivalentVGPRClass(RC);
1178   if (RC == &AMDGPU::SCC_CLASSRegClass)
1179     return getWaveMaskRegClass();
1180 
1181   return RC;
1182 }
1183 
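// Number of 32-bit sub-registers moved by a spill pseudo: the register size in
// DWORDs, except for block spills, where it is the population count of the
// statically known mask operand.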
1184 static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI,
1185                                         const SIInstrInfo *TII) {
1186 
1187   unsigned Op = MI.getOpcode();
1188   switch (Op) {
1189   case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1190   case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1191     // FIXME: This assumes the mask is statically known and not computed at
1192     // runtime. However, some ABIs may want to compute the mask dynamically and
1193     // this will need to be updated.
1194     return llvm::popcount(
1195         (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1196   case AMDGPU::SI_SPILL_S1024_SAVE:
1197   case AMDGPU::SI_SPILL_S1024_RESTORE:
1198   case AMDGPU::SI_SPILL_V1024_SAVE:
1199   case AMDGPU::SI_SPILL_V1024_RESTORE:
1200   case AMDGPU::SI_SPILL_A1024_SAVE:
1201   case AMDGPU::SI_SPILL_A1024_RESTORE:
1202   case AMDGPU::SI_SPILL_AV1024_SAVE:
1203   case AMDGPU::SI_SPILL_AV1024_RESTORE:
1204     return 32;
1205   case AMDGPU::SI_SPILL_S512_SAVE:
1206   case AMDGPU::SI_SPILL_S512_RESTORE:
1207   case AMDGPU::SI_SPILL_V512_SAVE:
1208   case AMDGPU::SI_SPILL_V512_RESTORE:
1209   case AMDGPU::SI_SPILL_A512_SAVE:
1210   case AMDGPU::SI_SPILL_A512_RESTORE:
1211   case AMDGPU::SI_SPILL_AV512_SAVE:
1212   case AMDGPU::SI_SPILL_AV512_RESTORE:
1213     return 16;
1214   case AMDGPU::SI_SPILL_S384_SAVE:
1215   case AMDGPU::SI_SPILL_S384_RESTORE:
1216   case AMDGPU::SI_SPILL_V384_SAVE:
1217   case AMDGPU::SI_SPILL_V384_RESTORE:
1218   case AMDGPU::SI_SPILL_A384_SAVE:
1219   case AMDGPU::SI_SPILL_A384_RESTORE:
1220   case AMDGPU::SI_SPILL_AV384_SAVE:
1221   case AMDGPU::SI_SPILL_AV384_RESTORE:
1222     return 12;
1223   case AMDGPU::SI_SPILL_S352_SAVE:
1224   case AMDGPU::SI_SPILL_S352_RESTORE:
1225   case AMDGPU::SI_SPILL_V352_SAVE:
1226   case AMDGPU::SI_SPILL_V352_RESTORE:
1227   case AMDGPU::SI_SPILL_A352_SAVE:
1228   case AMDGPU::SI_SPILL_A352_RESTORE:
1229   case AMDGPU::SI_SPILL_AV352_SAVE:
1230   case AMDGPU::SI_SPILL_AV352_RESTORE:
1231     return 11;
1232   case AMDGPU::SI_SPILL_S320_SAVE:
1233   case AMDGPU::SI_SPILL_S320_RESTORE:
1234   case AMDGPU::SI_SPILL_V320_SAVE:
1235   case AMDGPU::SI_SPILL_V320_RESTORE:
1236   case AMDGPU::SI_SPILL_A320_SAVE:
1237   case AMDGPU::SI_SPILL_A320_RESTORE:
1238   case AMDGPU::SI_SPILL_AV320_SAVE:
1239   case AMDGPU::SI_SPILL_AV320_RESTORE:
1240     return 10;
1241   case AMDGPU::SI_SPILL_S288_SAVE:
1242   case AMDGPU::SI_SPILL_S288_RESTORE:
1243   case AMDGPU::SI_SPILL_V288_SAVE:
1244   case AMDGPU::SI_SPILL_V288_RESTORE:
1245   case AMDGPU::SI_SPILL_A288_SAVE:
1246   case AMDGPU::SI_SPILL_A288_RESTORE:
1247   case AMDGPU::SI_SPILL_AV288_SAVE:
1248   case AMDGPU::SI_SPILL_AV288_RESTORE:
1249     return 9;
1250   case AMDGPU::SI_SPILL_S256_SAVE:
1251   case AMDGPU::SI_SPILL_S256_RESTORE:
1252   case AMDGPU::SI_SPILL_V256_SAVE:
1253   case AMDGPU::SI_SPILL_V256_RESTORE:
1254   case AMDGPU::SI_SPILL_A256_SAVE:
1255   case AMDGPU::SI_SPILL_A256_RESTORE:
1256   case AMDGPU::SI_SPILL_AV256_SAVE:
1257   case AMDGPU::SI_SPILL_AV256_RESTORE:
1258     return 8;
1259   case AMDGPU::SI_SPILL_S224_SAVE:
1260   case AMDGPU::SI_SPILL_S224_RESTORE:
1261   case AMDGPU::SI_SPILL_V224_SAVE:
1262   case AMDGPU::SI_SPILL_V224_RESTORE:
1263   case AMDGPU::SI_SPILL_A224_SAVE:
1264   case AMDGPU::SI_SPILL_A224_RESTORE:
1265   case AMDGPU::SI_SPILL_AV224_SAVE:
1266   case AMDGPU::SI_SPILL_AV224_RESTORE:
1267     return 7;
1268   case AMDGPU::SI_SPILL_S192_SAVE:
1269   case AMDGPU::SI_SPILL_S192_RESTORE:
1270   case AMDGPU::SI_SPILL_V192_SAVE:
1271   case AMDGPU::SI_SPILL_V192_RESTORE:
1272   case AMDGPU::SI_SPILL_A192_SAVE:
1273   case AMDGPU::SI_SPILL_A192_RESTORE:
1274   case AMDGPU::SI_SPILL_AV192_SAVE:
1275   case AMDGPU::SI_SPILL_AV192_RESTORE:
1276     return 6;
1277   case AMDGPU::SI_SPILL_S160_SAVE:
1278   case AMDGPU::SI_SPILL_S160_RESTORE:
1279   case AMDGPU::SI_SPILL_V160_SAVE:
1280   case AMDGPU::SI_SPILL_V160_RESTORE:
1281   case AMDGPU::SI_SPILL_A160_SAVE:
1282   case AMDGPU::SI_SPILL_A160_RESTORE:
1283   case AMDGPU::SI_SPILL_AV160_SAVE:
1284   case AMDGPU::SI_SPILL_AV160_RESTORE:
1285     return 5;
1286   case AMDGPU::SI_SPILL_S128_SAVE:
1287   case AMDGPU::SI_SPILL_S128_RESTORE:
1288   case AMDGPU::SI_SPILL_V128_SAVE:
1289   case AMDGPU::SI_SPILL_V128_RESTORE:
1290   case AMDGPU::SI_SPILL_A128_SAVE:
1291   case AMDGPU::SI_SPILL_A128_RESTORE:
1292   case AMDGPU::SI_SPILL_AV128_SAVE:
1293   case AMDGPU::SI_SPILL_AV128_RESTORE:
1294     return 4;
1295   case AMDGPU::SI_SPILL_S96_SAVE:
1296   case AMDGPU::SI_SPILL_S96_RESTORE:
1297   case AMDGPU::SI_SPILL_V96_SAVE:
1298   case AMDGPU::SI_SPILL_V96_RESTORE:
1299   case AMDGPU::SI_SPILL_A96_SAVE:
1300   case AMDGPU::SI_SPILL_A96_RESTORE:
1301   case AMDGPU::SI_SPILL_AV96_SAVE:
1302   case AMDGPU::SI_SPILL_AV96_RESTORE:
1303     return 3;
1304   case AMDGPU::SI_SPILL_S64_SAVE:
1305   case AMDGPU::SI_SPILL_S64_RESTORE:
1306   case AMDGPU::SI_SPILL_V64_SAVE:
1307   case AMDGPU::SI_SPILL_V64_RESTORE:
1308   case AMDGPU::SI_SPILL_A64_SAVE:
1309   case AMDGPU::SI_SPILL_A64_RESTORE:
1310   case AMDGPU::SI_SPILL_AV64_SAVE:
1311   case AMDGPU::SI_SPILL_AV64_RESTORE:
1312     return 2;
1313   case AMDGPU::SI_SPILL_S32_SAVE:
1314   case AMDGPU::SI_SPILL_S32_RESTORE:
1315   case AMDGPU::SI_SPILL_V32_SAVE:
1316   case AMDGPU::SI_SPILL_V32_RESTORE:
1317   case AMDGPU::SI_SPILL_A32_SAVE:
1318   case AMDGPU::SI_SPILL_A32_RESTORE:
1319   case AMDGPU::SI_SPILL_AV32_SAVE:
1320   case AMDGPU::SI_SPILL_AV32_RESTORE:
1321   case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1322   case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1323   case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1324   case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1325   case AMDGPU::SI_SPILL_V16_SAVE:
1326   case AMDGPU::SI_SPILL_V16_RESTORE:
1327     return 1;
1328   default: llvm_unreachable("Invalid spill opcode");
1329   }
1330 }
1331 
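// (Added note) Map an OFFEN (VGPR-indexed) MUBUF store opcode to its OFFSET
// (immediate-offset) form, or return -1 if no direct equivalent exists.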
1332 static int getOffsetMUBUFStore(unsigned Opc) {
1333   switch (Opc) {
1334   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1335     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1336   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1337     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1338   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1339     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1340   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1341     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1342   case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1343     return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1344   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1345     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1346   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1347     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1348   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1349     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1350   default:
1351     return -1;
1352   }
1353 }
1354 
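// (Added note) Same mapping for MUBUF loads: OFFEN form to OFFSET form, or -1.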
1355 static int getOffsetMUBUFLoad(unsigned Opc) {
1356   switch (Opc) {
1357   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1358     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1359   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1360     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1361   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1362     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1363   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1364     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1365   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1366     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1367   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1368     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1369   case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1370     return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1371   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1372     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1373   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1374     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1375   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1376     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1377   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1378     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1379   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1380     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1381   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1382     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1383   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1384     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1385   default:
1386     return -1;
1387   }
1388 }
1389 
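// (Added note) Inverse direction: map an OFFSET MUBUF store opcode to its
// OFFEN (VGPR-indexed) form, or -1 if unsupported.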
1390 static int getOffenMUBUFStore(unsigned Opc) {
1391   switch (Opc) {
1392   case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1393     return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1394   case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1395     return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1396   case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1397     return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1398   case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1399     return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1400   case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1401     return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1402   case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1403     return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1404   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1405     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1406   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1407     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1408   default:
1409     return -1;
1410   }
1411 }
1412 
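// (Added note) Inverse direction for loads: OFFSET form to OFFEN form, or -1.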
1413 static int getOffenMUBUFLoad(unsigned Opc) {
1414   switch (Opc) {
1415   case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1416     return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1417   case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1418     return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1419   case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1420     return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1421   case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1422     return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1423   case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1424     return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1425   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1426     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1427   case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1428     return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1429   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1430     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1431   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1432     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1433   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1434     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1435   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1436     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1437   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1438     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1439   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1440     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1441   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1442     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1443   default:
1444     return -1;
1445   }
1446 }
1447 
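// (Added note) Try to spill/restore ValueReg to/from the AGPR (or VGPR) lane
// reserved for frame index Index; emits an accvgpr read/write, or a plain COPY
// when source and destination end up in the same register class. Returns an
// empty MachineInstrBuilder if no register was assigned for this (Index, Lane)
// pair, so the caller must fall back to a memory spill.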
1448 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
1449                                            MachineBasicBlock &MBB,
1450                                            MachineBasicBlock::iterator MI,
1451                                            int Index, unsigned Lane,
1452                                            unsigned ValueReg, bool IsKill) {
1453   MachineFunction *MF = MBB.getParent();
1454   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1455   const SIInstrInfo *TII = ST.getInstrInfo();
1456 
1457   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1458 
1459   if (Reg == AMDGPU::NoRegister)
1460     return MachineInstrBuilder();
1461 
1462   bool IsStore = MI->mayStore();
1463   MachineRegisterInfo &MRI = MF->getRegInfo();
1464   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1465 
1466   unsigned Dst = IsStore ? Reg : ValueReg;
1467   unsigned Src = IsStore ? ValueReg : Reg;
1468   bool IsVGPR = TRI->isVGPR(MRI, Reg);
1469   DebugLoc DL = MI->getDebugLoc();
1470   if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1471     // The spiller during regalloc may restore a spilled register to its
1472     // superclass. This can result in AGPR spills restored to VGPRs or the other
1473     // way around, leaving the source and destination with identical register
1474     // classes at this point. A plain copy suffices in such cases.
1475     auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1476                        .addReg(Src, getKillRegState(IsKill));
1477     CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1478     return CopyMIB;
1479   }
1480   unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1481                                     : AMDGPU::V_ACCVGPR_READ_B32_e64;
1482 
1483   auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1484                  .addReg(Src, getKillRegState(IsKill));
1485   MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1486   return MIB;
1487 }
1488 
1489 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1490 // need to handle the case where an SGPR may need to be spilled while spilling.
1491 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1492                                       MachineFrameInfo &MFI,
1493                                       MachineBasicBlock::iterator MI,
1494                                       int Index,
1495                                       int64_t Offset) {
1496   const SIInstrInfo *TII = ST.getInstrInfo();
1497   MachineBasicBlock *MBB = MI->getParent();
1498   const DebugLoc &DL = MI->getDebugLoc();
1499   bool IsStore = MI->mayStore();
1500 
1501   unsigned Opc = MI->getOpcode();
1502   int LoadStoreOp = IsStore ?
1503     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1504   if (LoadStoreOp == -1)
1505     return false;
1506 
1507   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1508   if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1509     return true;
1510 
1511   MachineInstrBuilder NewMI =
1512       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1513           .add(*Reg)
1514           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1515           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1516           .addImm(Offset)
1517           .addImm(0) // cpol
1518           .addImm(0) // swz
1519           .cloneMemRefs(*MI);
1520 
1521   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1522                                                        AMDGPU::OpName::vdata_in);
1523   if (VDataIn)
1524     NewMI.add(*VDataIn);
1525   return true;
1526 }
1527 
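// (Added note) Pick the flat scratch opcode matching the spill element size:
// start from the SADDR form, then switch to the SV form if the original opcode
// has a vaddr operand, or to the ST form if it has neither vaddr nor saddr.
// Block load/store opcodes are returned unchanged.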
1528 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1529                                           unsigned LoadStoreOp,
1530                                           unsigned EltSize) {
1531   bool IsStore = TII->get(LoadStoreOp).mayStore();
1532   bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1533   bool UseST =
1534       !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1535 
1536   // Handle block load/store first.
1537   if (TII->isBlockLoadStore(LoadStoreOp))
1538     return LoadStoreOp;
1539 
1540   switch (EltSize) {
1541   case 4:
1542     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1543                           : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1544     break;
1545   case 8:
1546     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1547                           : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1548     break;
1549   case 12:
1550     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1551                           : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1552     break;
1553   case 16:
1554     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1555                           : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1556     break;
1557   default:
1558     llvm_unreachable("Unexpected spill load/store size!");
1559   }
1560 
1561   if (HasVAddr)
1562     LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1563   else if (UseST)
1564     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1565 
1566   return LoadStoreOp;
1567 }
1568 
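// (Added note) Emit the scratch load/store sequence for a register spill or
// reload: split the register into EltSize pieces, handle both MUBUF and flat
// scratch addressing, scavenge an SGPR or VGPR when the immediate offset is
// out of range, and redirect individual lanes to AGPRs where spill registers
// were allocated.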
1569 void SIRegisterInfo::buildSpillLoadStore(
1570     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1571     unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1572     MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1573     RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1574   assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1575 
1576   MachineFunction *MF = MBB.getParent();
1577   const SIInstrInfo *TII = ST.getInstrInfo();
1578   const MachineFrameInfo &MFI = MF->getFrameInfo();
1579   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1580 
1581   const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1582   bool IsStore = Desc->mayStore();
1583   bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1584   bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1585 
1586   bool CanClobberSCC = false;
1587   bool Scavenged = false;
1588   MCRegister SOffset = ScratchOffsetReg;
1589 
1590   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1591   // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1592   const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1593   const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1594 
1595   // Always use 4 byte operations for AGPRs because we need to scavenge
1596   // a temporary VGPR.
1597   // If we're using a block operation, the element should be the whole block.
1598   unsigned EltSize = IsBlock               ? RegWidth
1599                      : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1600                                            : 4u;
1601   unsigned NumSubRegs = RegWidth / EltSize;
1602   unsigned Size = NumSubRegs * EltSize;
1603   unsigned RemSize = RegWidth - Size;
1604   unsigned NumRemSubRegs = RemSize ? 1 : 0;
1605   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1606   int64_t MaterializedOffset = Offset;
1607 
1608   int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1609   int64_t ScratchOffsetRegDelta = 0;
1610 
1611   if (IsFlat && EltSize > 4) {
1612     LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1613     Desc = &TII->get(LoadStoreOp);
1614   }
1615 
1616   Align Alignment = MFI.getObjectAlign(Index);
1617   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1618 
1619   assert((IsFlat || ((Offset % EltSize) == 0)) &&
1620          "unexpected VGPR spill offset");
1621 
1622   // Track a VGPR to use for a constant offset we need to materialize.
1623   Register TmpOffsetVGPR;
1624 
1625   // Track a VGPR to use as an intermediate value.
1626   Register TmpIntermediateVGPR;
1627   bool UseVGPROffset = false;
1628 
1629   // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1630   // combination.
1631   auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1632                                 int64_t VOffset) {
1633     // We are using a VGPR offset
1634     if (IsFlat && SGPRBase) {
1635       // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1636       // SGPR, so perform the add as vector.
1637       // We don't need a base SGPR in the kernel.
1638 
1639       if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1640         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1641           .addReg(SGPRBase)
1642           .addImm(VOffset)
1643           .addImm(0); // clamp
1644       } else {
1645         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1646           .addReg(SGPRBase);
1647         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1648           .addImm(VOffset)
1649           .addReg(TmpOffsetVGPR);
1650       }
1651     } else {
1652       assert(TmpOffsetVGPR);
1653       BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1654         .addImm(VOffset);
1655     }
1656   };
1657 
1658   bool IsOffsetLegal =
1659       IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1660                                       SIInstrFlags::FlatScratch)
1661              : TII->isLegalMUBUFImmOffset(MaxOffset);
1662   if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1663     SOffset = MCRegister();
1664 
1665     // We don't have access to the register scavenger if this function is called
1666     // during PEI::scavengeFrameVirtualRegs(), so use LiveUnits in this case.
1667     // TODO: Clobbering SCC is not necessary for scratch instructions in the
1668     // entry.
1669     if (RS) {
1670       SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1671 
1672       // Piggy-back on the liveness scan we just did to see if SCC is dead.
1673       CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1674     } else if (LiveUnits) {
1675       CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1676       for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1677         if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1678           SOffset = Reg;
1679           break;
1680         }
1681       }
1682     }
1683 
1684     if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1685       SOffset = Register();
1686 
1687     if (!SOffset) {
1688       UseVGPROffset = true;
1689 
1690       if (RS) {
1691         TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1692       } else {
1693         assert(LiveUnits);
1694         for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1695           if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1696             TmpOffsetVGPR = Reg;
1697             break;
1698           }
1699         }
1700       }
1701 
1702       assert(TmpOffsetVGPR);
1703     } else if (!SOffset && CanClobberSCC) {
1704       // There are no free SGPRs, and we are in the process of spilling VGPRs too.
1705       // Since we need a VGPR in order to spill SGPRs (this is true on SI/CI, and
1706       // on VI it remains true until we implement spilling using scalar stores),
1707       // we have no way to free up an SGPR. Our solution here is to add the offset
1708       // directly to the ScratchOffset or StackPtrOffset register, and then
1709       // subtract the offset after the spill to return the register to its
1710       // original value.
1711 
1712       // TODO: If we don't have to do an emergency stack slot spill, converting
1713       // to use the VGPR offset takes fewer instructions.
1714       if (!ScratchOffsetReg)
1715         ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1716       SOffset = ScratchOffsetReg;
1717       ScratchOffsetRegDelta = Offset;
1718     } else {
1719       Scavenged = true;
1720     }
1721 
1722     // We currently only support spilling VGPRs to EltSize boundaries, meaning
1723     // we can simplify the adjustment of Offset here to just scale with
1724     // WavefrontSize.
1725     if (!IsFlat && !UseVGPROffset)
1726       Offset *= ST.getWavefrontSize();
1727 
1728     if (!UseVGPROffset && !SOffset)
1729       report_fatal_error("could not scavenge SGPR to spill in entry function");
1730 
1731     if (UseVGPROffset) {
1732       // We are using a VGPR offset
1733       MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1734     } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1735       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1736     } else {
1737       assert(Offset != 0);
1738       auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1739           .addReg(ScratchOffsetReg)
1740           .addImm(Offset);
1741       Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1742     }
1743 
1744     Offset = 0;
1745   }
1746 
1747   if (IsFlat && SOffset == AMDGPU::NoRegister) {
1748     assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1749            && "Unexpected vaddr for flat scratch with a FI operand");
1750 
1751     if (UseVGPROffset) {
1752       LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1753     } else {
1754       assert(ST.hasFlatScratchSTMode());
1755       assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1756       LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1757     }
1758 
1759     Desc = &TII->get(LoadStoreOp);
1760   }
1761 
1762   for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1763        ++i, RegOffset += EltSize) {
1764     if (i == NumSubRegs) {
1765       EltSize = RemSize;
1766       LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1767     }
1768     Desc = &TII->get(LoadStoreOp);
1769 
1770     if (!IsFlat && UseVGPROffset) {
1771       int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1772                                    : getOffenMUBUFLoad(LoadStoreOp);
1773       Desc = &TII->get(NewLoadStoreOp);
1774     }
1775 
1776     if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1777       // If we are spilling an AGPR beyond the range of the memory instruction
1778       // offset and need to use a VGPR offset, we ideally have at least 2
1779       // scratch VGPRs. If we don't have a second free VGPR without spilling,
1780       // recycle the VGPR used for the offset which requires resetting after
1781       // each subregister.
1782 
1783       MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1784     }
1785 
1786     unsigned NumRegs = EltSize / 4;
1787     Register SubReg = e == 1
1788             ? ValueReg
1789             : Register(getSubReg(ValueReg,
1790                                  getSubRegFromChannel(RegOffset / 4, NumRegs)));
1791 
1792     unsigned SOffsetRegState = 0;
1793     unsigned SrcDstRegState = getDefRegState(!IsStore);
1794     const bool IsLastSubReg = i + 1 == e;
1795     const bool IsFirstSubReg = i == 0;
1796     if (IsLastSubReg) {
1797       SOffsetRegState |= getKillRegState(Scavenged);
1798       // The last implicit use carries the "Kill" flag.
1799       SrcDstRegState |= getKillRegState(IsKill);
1800     }
1801 
1802     // Make sure the whole register is defined if there are undef components by
1803     // adding an implicit def of the super-reg on the first instruction.
1804     bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1805     bool NeedSuperRegImpOperand = e > 1;
1806 
1807     // Remaining element size to spill into memory after some parts of it have
1808     // been spilled into either AGPRs or VGPRs.
1809     unsigned RemEltSize = EltSize;
1810 
1811     // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse order,
1812     // starting from the last lane. If a register cannot be completely spilled
1813     // into another register, this ensures its alignment does not change. For
1814     // targets with a VGPR alignment requirement this is important with flat
1815     // scratch usage, as we might otherwise get a scratch_load or scratch_store
1816     // of an unaligned register.
1817     for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1818              LaneE = RegOffset / 4;
1819          Lane >= LaneE; --Lane) {
1820       bool IsSubReg = e > 1 || EltSize > 4;
1821       Register Sub = IsSubReg
1822              ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1823              : ValueReg;
1824       auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1825       if (!MIB.getInstr())
1826         break;
1827       if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1828         MIB.addReg(ValueReg, RegState::ImplicitDefine);
1829         NeedSuperRegDef = false;
1830       }
1831       if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1832         NeedSuperRegImpOperand = true;
1833         unsigned State = SrcDstRegState;
1834         if (!IsLastSubReg || (Lane != LaneE))
1835           State &= ~RegState::Kill;
1836         if (!IsFirstSubReg || (Lane != LaneS))
1837           State &= ~RegState::Define;
1838         MIB.addReg(ValueReg, RegState::Implicit | State);
1839       }
1840       RemEltSize -= 4;
1841     }
1842 
1843     if (!RemEltSize) // Fully spilled into AGPRs.
1844       continue;
1845 
1846     if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1847       assert(IsFlat && EltSize > 4);
1848 
1849       unsigned NumRegs = RemEltSize / 4;
1850       SubReg = Register(getSubReg(ValueReg,
1851                         getSubRegFromChannel(RegOffset / 4, NumRegs)));
1852       unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1853       Desc = &TII->get(Opc);
1854     }
1855 
1856     unsigned FinalReg = SubReg;
1857 
1858     if (IsAGPR) {
1859       assert(EltSize == 4);
1860 
1861       if (!TmpIntermediateVGPR) {
1862         TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1863         assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1864       }
1865       if (IsStore) {
1866         auto AccRead = BuildMI(MBB, MI, DL,
1867                                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1868                                TmpIntermediateVGPR)
1869                            .addReg(SubReg, getKillRegState(IsKill));
1870         if (NeedSuperRegDef)
1871           AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1872         if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1873           AccRead.addReg(ValueReg, RegState::Implicit);
1874         AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1875       }
1876       SubReg = TmpIntermediateVGPR;
1877     } else if (UseVGPROffset) {
1878       if (!TmpOffsetVGPR) {
1879         TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1880                                                       MI, false, 0);
1881         RS->setRegUsed(TmpOffsetVGPR);
1882       }
1883     }
1884 
1885     MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1886     MachineMemOperand *NewMMO =
1887         MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1888                                  commonAlignment(Alignment, RegOffset));
1889 
1890     auto MIB =
1891         BuildMI(MBB, MI, DL, *Desc)
1892             .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1893 
1894     if (UseVGPROffset) {
1895       // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1896       // intermediate accvgpr_write.
1897       MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1898     }
1899 
1900     if (!IsFlat)
1901       MIB.addReg(FuncInfo->getScratchRSrcReg());
1902 
1903     if (SOffset == AMDGPU::NoRegister) {
1904       if (!IsFlat) {
1905         if (UseVGPROffset && ScratchOffsetReg) {
1906           MIB.addReg(ScratchOffsetReg);
1907         } else {
1908           assert(FuncInfo->isBottomOfStack());
1909           MIB.addImm(0);
1910         }
1911       }
1912     } else {
1913       MIB.addReg(SOffset, SOffsetRegState);
1914     }
1915 
1916     MIB.addImm(Offset + RegOffset);
1917 
1918     bool LastUse = MMO->getFlags() & MOLastUse;
1919     MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1920 
1921     if (!IsFlat)
1922       MIB.addImm(0); // swz
1923     MIB.addMemOperand(NewMMO);
1924 
1925     if (!IsAGPR && NeedSuperRegDef)
1926       MIB.addReg(ValueReg, RegState::ImplicitDefine);
1927 
1928     if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1929       MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1930                     FinalReg)
1931                 .addReg(TmpIntermediateVGPR, RegState::Kill);
1932       MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1933     }
1934 
1935     bool IsSrcDstDef = SrcDstRegState & RegState::Define;
1936     if (NeedSuperRegImpOperand &&
1937         (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef)))
1938       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1939 
1940     // The epilog restore of a wwm-scratch register can cause an undesired
1941     // optimization during machine-cp after PrologEpilogInserter if the same
1942     // register was assigned for return-value ABI lowering with a COPY
1943     // instruction. As shown below, with the epilog reload the earlier COPY
1944     // appears dead to machine-cp.
1945     // ...
1946     // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1947     // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1948     // ...
1949     // Epilog block:
1950     // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1951     // ...
1952     // WWM spill restore to preserve the inactive lanes of v0.
1953     // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1954     // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1955     // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1956     // ...
1957     // SI_RETURN implicit $vgpr0
1958     // ...
1959     // To fix it, mark the same reg as a tied op for such restore instructions
1960     // so that it marks a usage for the preceding COPY.
1961     if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1962         MI->readsRegister(SubReg, this)) {
1963       MIB.addReg(SubReg, RegState::Implicit);
1964       MIB->tieOperands(0, MIB->getNumOperands() - 1);
1965     }
1966 
1967     //  If we're building a block load, we should add artificial uses for the
1968     //  CSR VGPRs that are *not* being transferred. This is because liveness
1969     //  analysis is not aware of the mask, so we need to somehow inform it that
1970     //  those registers are not available before the load and they should not be
1971     //  scavenged.
1972     if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1973       addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1974   }
1975 
1976   if (ScratchOffsetRegDelta != 0) {
1977     // Subtract the offset we added to the ScratchOffset register.
1978     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1979         .addReg(SOffset)
1980         .addImm(-ScratchOffsetRegDelta);
1981   }
1982 }
1983 
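// (Added note) For a block CSR load, add implicit uses of the callee-saved
// VGPRs within the block that are not covered by the transfer mask, so that
// liveness analysis does not treat them as free before the load.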
1984 void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
1985                                                     Register BlockReg) const {
1986   const MachineFunction *MF = MIB->getParent()->getParent();
1987   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1988   uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
1989   Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
1990   for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
1991     if (!(Mask & (1 << RegOffset)) &&
1992         isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
1993       MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
1994 }
1995 
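// (Added note) Emit a single-dword scratch load or store of SB.TmpVGPR for an
// SGPR spill that goes through memory; Offset is given in units of SB.EltSize
// within the spill slot.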
1996 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1997                                              int Offset, bool IsLoad,
1998                                              bool IsKill) const {
1999   // Load/store VGPR
2000   MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
2001   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
2002 
2003   Register FrameReg =
2004       FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
2005           ? getBaseRegister()
2006           : getFrameRegister(SB.MF);
2007 
2008   Align Alignment = FrameInfo.getObjectAlign(Index);
2009   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
2010   MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
2011       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
2012       SB.EltSize, Alignment);
2013 
2014   if (IsLoad) {
2015     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2016                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2017     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
2018                         FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2019   } else {
2020     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2021                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2022     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
2023                         FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2024     // This only ever adds one VGPR spill
2025     SB.MFI.addToSpilledVGPRs(1);
2026   }
2027 }
2028 
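// (Added note) Lower an SI_SPILL_Sxxx_SAVE pseudo. Each 32-bit piece of the
// SGPR is written into a reserved VGPR lane when lanes were allocated for this
// frame index; otherwise the pieces are packed into a temporary VGPR and the
// VGPR is stored to scratch memory.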
2029 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
2030                                RegScavenger *RS, SlotIndexes *Indexes,
2031                                LiveIntervals *LIS, bool OnlyToVGPR,
2032                                bool SpillToPhysVGPRLane) const {
2033   assert(!MI->getOperand(0).isUndef() &&
2034          "undef spill should have been deleted earlier");
2035 
2036   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2037 
2038   ArrayRef<SpilledReg> VGPRSpills =
2039       SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2040                           : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2041   bool SpillToVGPR = !VGPRSpills.empty();
2042   if (OnlyToVGPR && !SpillToVGPR)
2043     return false;
2044 
2045   assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2046                          SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2047 
2048   if (SpillToVGPR) {
2049 
2050     // Since the stack slot coloring pass tries to optimize SGPR spills, VGPR
2051     // lanes (mapped from a spill stack slot) may be shared by SGPR spills of
2052     // different sizes. The number of VGPR lanes allotted therefore corresponds
2053     // to the largest SGPR being spilled into them.
2054     assert(SB.NumSubRegs <= VGPRSpills.size() &&
2055            "Num of SGPRs spilled should be less than or equal to num of "
2056            "the VGPR lanes.");
2057 
2058     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2059       Register SubReg =
2060           SB.NumSubRegs == 1
2061               ? SB.SuperReg
2062               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2063       SpilledReg Spill = VGPRSpills[i];
2064 
2065       bool IsFirstSubreg = i == 0;
2066       bool IsLastSubreg = i == SB.NumSubRegs - 1;
2067       bool UseKill = SB.IsKill && IsLastSubreg;
2068 
2069 
2070       // Mark the "old value of vgpr" input undef only if this is the first sgpr
2071       // spill to this specific vgpr in the first basic block.
2072       auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2073                          SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2074                      .addReg(SubReg, getKillRegState(UseKill))
2075                      .addImm(Spill.Lane)
2076                      .addReg(Spill.VGPR);
2077       if (Indexes) {
2078         if (IsFirstSubreg)
2079           Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2080         else
2081           Indexes->insertMachineInstrInMaps(*MIB);
2082       }
2083 
2084       if (IsFirstSubreg && SB.NumSubRegs > 1) {
2085         // We may be spilling a super-register which is only partially defined,
2086         // and need to ensure later spills think the value is defined.
2087         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2088       }
2089 
2090       if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2091         MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2092 
2093       // FIXME: Since this spills to another register instead of an actual
2094       // frame index, we should delete the frame index when all references to
2095       // it are fixed.
2096     }
2097   } else {
2098     SB.prepare();
2099 
2100     // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2101     unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2102 
2103     // Per VGPR helper data
2104     auto PVD = SB.getPerVGPRData();
2105 
2106     for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2107       unsigned TmpVGPRFlags = RegState::Undef;
2108 
2109       // Write sub registers into the VGPR
2110       for (unsigned i = Offset * PVD.PerVGPR,
2111                     e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2112            i < e; ++i) {
2113         Register SubReg =
2114             SB.NumSubRegs == 1
2115                 ? SB.SuperReg
2116                 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2117 
2118         MachineInstrBuilder WriteLane =
2119             BuildMI(*SB.MBB, MI, SB.DL,
2120                     SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2121                 .addReg(SubReg, SubKillState)
2122                 .addImm(i % PVD.PerVGPR)
2123                 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2124         TmpVGPRFlags = 0;
2125 
2126         if (Indexes) {
2127           if (i == 0)
2128             Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2129           else
2130             Indexes->insertMachineInstrInMaps(*WriteLane);
2131         }
2132 
2133         // There could be undef components of a spilled super register.
2134         // TODO: Can we detect this and skip the spill?
2135         if (SB.NumSubRegs > 1) {
2136           // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2137           unsigned SuperKillState = 0;
2138           if (i + 1 == SB.NumSubRegs)
2139             SuperKillState |= getKillRegState(SB.IsKill);
2140           WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2141         }
2142       }
2143 
2144       // Write out VGPR
2145       SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2146     }
2147 
2148     SB.restore();
2149   }
2150 
2151   MI->eraseFromParent();
2152   SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2153 
2154   if (LIS)
2155     LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2156 
2157   return true;
2158 }
2159 
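// (Added note) Lower an SI_SPILL_Sxxx_RESTORE pseudo, the inverse of
// spillSGPR: read the 32-bit pieces back from the reserved VGPR lanes, or
// reload the temporary VGPR from scratch memory and unpack its lanes.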
2160 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2161                                  RegScavenger *RS, SlotIndexes *Indexes,
2162                                  LiveIntervals *LIS, bool OnlyToVGPR,
2163                                  bool SpillToPhysVGPRLane) const {
2164   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2165 
2166   ArrayRef<SpilledReg> VGPRSpills =
2167       SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2168                           : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2169   bool SpillToVGPR = !VGPRSpills.empty();
2170   if (OnlyToVGPR && !SpillToVGPR)
2171     return false;
2172 
2173   if (SpillToVGPR) {
2174     for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2175       Register SubReg =
2176           SB.NumSubRegs == 1
2177               ? SB.SuperReg
2178               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2179 
2180       SpilledReg Spill = VGPRSpills[i];
2181       auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2182                          SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2183                      .addReg(Spill.VGPR)
2184                      .addImm(Spill.Lane);
2185       if (SB.NumSubRegs > 1 && i == 0)
2186         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2187       if (Indexes) {
2188         if (i == e - 1)
2189           Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2190         else
2191           Indexes->insertMachineInstrInMaps(*MIB);
2192       }
2193     }
2194   } else {
2195     SB.prepare();
2196 
2197     // Per VGPR helper data
2198     auto PVD = SB.getPerVGPRData();
2199 
2200     for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2201       // Load in VGPR data
2202       SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2203 
2204       // Unpack lanes
2205       for (unsigned i = Offset * PVD.PerVGPR,
2206                     e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2207            i < e; ++i) {
2208         Register SubReg =
2209             SB.NumSubRegs == 1
2210                 ? SB.SuperReg
2211                 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2212 
2213         bool LastSubReg = (i + 1 == e);
2214         auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2215                            SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2216                        .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2217                        .addImm(i);
2218         if (SB.NumSubRegs > 1 && i == 0)
2219           MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2220         if (Indexes) {
2221           if (i == e - 1)
2222             Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2223           else
2224             Indexes->insertMachineInstrInMaps(*MIB);
2225         }
2226       }
2227     }
2228 
2229     SB.restore();
2230   }
2231 
2232   MI->eraseFromParent();
2233 
2234   if (LIS)
2235     LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2236 
2237   return true;
2238 }
2239 
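// (Added note) Spill SGPR into lanes of a scavenged VGPR without writing the
// VGPR out to memory, and emit the matching V_READLANE sequence at the end of
// RestoreMBB to reconstruct the SGPR there.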
2240 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2241                                         MachineBasicBlock &RestoreMBB,
2242                                         Register SGPR, RegScavenger *RS) const {
2243   SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2244                       RS);
2245   SB.prepare();
2246   // Generate the spill of SGPR to SB.TmpVGPR.
2247   unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2248   auto PVD = SB.getPerVGPRData();
2249   for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2250     unsigned TmpVGPRFlags = RegState::Undef;
2251     // Write sub registers into the VGPR
2252     for (unsigned i = Offset * PVD.PerVGPR,
2253                   e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2254          i < e; ++i) {
2255       Register SubReg =
2256           SB.NumSubRegs == 1
2257               ? SB.SuperReg
2258               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2259 
2260       MachineInstrBuilder WriteLane =
2261           BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2262                   SB.TmpVGPR)
2263               .addReg(SubReg, SubKillState)
2264               .addImm(i % PVD.PerVGPR)
2265               .addReg(SB.TmpVGPR, TmpVGPRFlags);
2266       TmpVGPRFlags = 0;
2267       // There could be undef components of a spilled super register.
2268       // TODO: Can we detect this and skip the spill?
2269       if (SB.NumSubRegs > 1) {
2270         // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2271         unsigned SuperKillState = 0;
2272         if (i + 1 == SB.NumSubRegs)
2273           SuperKillState |= getKillRegState(SB.IsKill);
2274         WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2275       }
2276     }
2277     // Don't need to write VGPR out.
2278   }
2279 
2280   MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2281 
2282   // Restore clobbered registers in the specified restore block.
2283   MI = RestoreMBB.end();
2284   SB.setMI(&RestoreMBB, MI);
2285   // Generate the restore of SGPR from SB.TmpVGPR.
2286   for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2287     // Don't need to load VGPR in.
2288     // Unpack lanes
2289     for (unsigned i = Offset * PVD.PerVGPR,
2290                   e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2291          i < e; ++i) {
2292       Register SubReg =
2293           SB.NumSubRegs == 1
2294               ? SB.SuperReg
2295               : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2296       MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
2297       bool LastSubReg = (i + 1 == e);
2298       auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2299                          SubReg)
2300                      .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2301                      .addImm(i);
2302       if (SB.NumSubRegs > 1 && i == 0)
2303         MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2304     }
2305   }
2306   SB.restore();
2307 
2308   SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2309   return false;
2310 }
2311 
2312 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2313 /// a VGPR and the stack slot can be safely eliminated when all other users are
2314 /// handled.
2315 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2316     MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2317     SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2318   switch (MI->getOpcode()) {
2319   case AMDGPU::SI_SPILL_S1024_SAVE:
2320   case AMDGPU::SI_SPILL_S512_SAVE:
2321   case AMDGPU::SI_SPILL_S384_SAVE:
2322   case AMDGPU::SI_SPILL_S352_SAVE:
2323   case AMDGPU::SI_SPILL_S320_SAVE:
2324   case AMDGPU::SI_SPILL_S288_SAVE:
2325   case AMDGPU::SI_SPILL_S256_SAVE:
2326   case AMDGPU::SI_SPILL_S224_SAVE:
2327   case AMDGPU::SI_SPILL_S192_SAVE:
2328   case AMDGPU::SI_SPILL_S160_SAVE:
2329   case AMDGPU::SI_SPILL_S128_SAVE:
2330   case AMDGPU::SI_SPILL_S96_SAVE:
2331   case AMDGPU::SI_SPILL_S64_SAVE:
2332   case AMDGPU::SI_SPILL_S32_SAVE:
2333     return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2334   case AMDGPU::SI_SPILL_S1024_RESTORE:
2335   case AMDGPU::SI_SPILL_S512_RESTORE:
2336   case AMDGPU::SI_SPILL_S384_RESTORE:
2337   case AMDGPU::SI_SPILL_S352_RESTORE:
2338   case AMDGPU::SI_SPILL_S320_RESTORE:
2339   case AMDGPU::SI_SPILL_S288_RESTORE:
2340   case AMDGPU::SI_SPILL_S256_RESTORE:
2341   case AMDGPU::SI_SPILL_S224_RESTORE:
2342   case AMDGPU::SI_SPILL_S192_RESTORE:
2343   case AMDGPU::SI_SPILL_S160_RESTORE:
2344   case AMDGPU::SI_SPILL_S128_RESTORE:
2345   case AMDGPU::SI_SPILL_S96_RESTORE:
2346   case AMDGPU::SI_SPILL_S64_RESTORE:
2347   case AMDGPU::SI_SPILL_S32_RESTORE:
2348     return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2349   default:
2350     llvm_unreachable("not an SGPR spill instruction");
2351   }
2352 }
2353 
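// (Added note) Rewrite the frame index operand of MI into a real register
// and/or immediate offset. Spill/restore pseudos are expanded into scratch
// memory accesses here; other users of the frame index get the folded offset
// or a materialized frame address.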
2354 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2355                                         int SPAdj, unsigned FIOperandNum,
2356                                         RegScavenger *RS) const {
2357   MachineFunction *MF = MI->getParent()->getParent();
2358   MachineBasicBlock *MBB = MI->getParent();
2359   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2360   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2361   const SIInstrInfo *TII = ST.getInstrInfo();
2362   const DebugLoc &DL = MI->getDebugLoc();
2363 
2364   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2365 
2366   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2367          "unreserved scratch RSRC register");
2368 
2369   MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2370   int Index = MI->getOperand(FIOperandNum).getIndex();
2371 
2372   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2373                           ? getBaseRegister()
2374                           : getFrameRegister(*MF);
2375 
2376   switch (MI->getOpcode()) {
2377     // SGPR register spill
2378     case AMDGPU::SI_SPILL_S1024_SAVE:
2379     case AMDGPU::SI_SPILL_S512_SAVE:
2380     case AMDGPU::SI_SPILL_S384_SAVE:
2381     case AMDGPU::SI_SPILL_S352_SAVE:
2382     case AMDGPU::SI_SPILL_S320_SAVE:
2383     case AMDGPU::SI_SPILL_S288_SAVE:
2384     case AMDGPU::SI_SPILL_S256_SAVE:
2385     case AMDGPU::SI_SPILL_S224_SAVE:
2386     case AMDGPU::SI_SPILL_S192_SAVE:
2387     case AMDGPU::SI_SPILL_S160_SAVE:
2388     case AMDGPU::SI_SPILL_S128_SAVE:
2389     case AMDGPU::SI_SPILL_S96_SAVE:
2390     case AMDGPU::SI_SPILL_S64_SAVE:
2391     case AMDGPU::SI_SPILL_S32_SAVE: {
2392       return spillSGPR(MI, Index, RS);
2393     }
2394 
2395     // SGPR register restore
2396     case AMDGPU::SI_SPILL_S1024_RESTORE:
2397     case AMDGPU::SI_SPILL_S512_RESTORE:
2398     case AMDGPU::SI_SPILL_S384_RESTORE:
2399     case AMDGPU::SI_SPILL_S352_RESTORE:
2400     case AMDGPU::SI_SPILL_S320_RESTORE:
2401     case AMDGPU::SI_SPILL_S288_RESTORE:
2402     case AMDGPU::SI_SPILL_S256_RESTORE:
2403     case AMDGPU::SI_SPILL_S224_RESTORE:
2404     case AMDGPU::SI_SPILL_S192_RESTORE:
2405     case AMDGPU::SI_SPILL_S160_RESTORE:
2406     case AMDGPU::SI_SPILL_S128_RESTORE:
2407     case AMDGPU::SI_SPILL_S96_RESTORE:
2408     case AMDGPU::SI_SPILL_S64_RESTORE:
2409     case AMDGPU::SI_SPILL_S32_RESTORE: {
2410       return restoreSGPR(MI, Index, RS);
2411     }
2412 
2413     // VGPR register spill
2414     case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2415       // Put mask into M0.
2416       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2417               AMDGPU::M0)
2418           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2419       LLVM_FALLTHROUGH;
2420     }
2421     case AMDGPU::SI_SPILL_V1024_SAVE:
2422     case AMDGPU::SI_SPILL_V512_SAVE:
2423     case AMDGPU::SI_SPILL_V384_SAVE:
2424     case AMDGPU::SI_SPILL_V352_SAVE:
2425     case AMDGPU::SI_SPILL_V320_SAVE:
2426     case AMDGPU::SI_SPILL_V288_SAVE:
2427     case AMDGPU::SI_SPILL_V256_SAVE:
2428     case AMDGPU::SI_SPILL_V224_SAVE:
2429     case AMDGPU::SI_SPILL_V192_SAVE:
2430     case AMDGPU::SI_SPILL_V160_SAVE:
2431     case AMDGPU::SI_SPILL_V128_SAVE:
2432     case AMDGPU::SI_SPILL_V96_SAVE:
2433     case AMDGPU::SI_SPILL_V64_SAVE:
2434     case AMDGPU::SI_SPILL_V32_SAVE:
2435     case AMDGPU::SI_SPILL_V16_SAVE:
2436     case AMDGPU::SI_SPILL_A1024_SAVE:
2437     case AMDGPU::SI_SPILL_A512_SAVE:
2438     case AMDGPU::SI_SPILL_A384_SAVE:
2439     case AMDGPU::SI_SPILL_A352_SAVE:
2440     case AMDGPU::SI_SPILL_A320_SAVE:
2441     case AMDGPU::SI_SPILL_A288_SAVE:
2442     case AMDGPU::SI_SPILL_A256_SAVE:
2443     case AMDGPU::SI_SPILL_A224_SAVE:
2444     case AMDGPU::SI_SPILL_A192_SAVE:
2445     case AMDGPU::SI_SPILL_A160_SAVE:
2446     case AMDGPU::SI_SPILL_A128_SAVE:
2447     case AMDGPU::SI_SPILL_A96_SAVE:
2448     case AMDGPU::SI_SPILL_A64_SAVE:
2449     case AMDGPU::SI_SPILL_A32_SAVE:
2450     case AMDGPU::SI_SPILL_AV1024_SAVE:
2451     case AMDGPU::SI_SPILL_AV512_SAVE:
2452     case AMDGPU::SI_SPILL_AV384_SAVE:
2453     case AMDGPU::SI_SPILL_AV352_SAVE:
2454     case AMDGPU::SI_SPILL_AV320_SAVE:
2455     case AMDGPU::SI_SPILL_AV288_SAVE:
2456     case AMDGPU::SI_SPILL_AV256_SAVE:
2457     case AMDGPU::SI_SPILL_AV224_SAVE:
2458     case AMDGPU::SI_SPILL_AV192_SAVE:
2459     case AMDGPU::SI_SPILL_AV160_SAVE:
2460     case AMDGPU::SI_SPILL_AV128_SAVE:
2461     case AMDGPU::SI_SPILL_AV96_SAVE:
2462     case AMDGPU::SI_SPILL_AV64_SAVE:
2463     case AMDGPU::SI_SPILL_AV32_SAVE:
2464     case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2465     case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2466       const MachineOperand *VData = TII->getNamedOperand(*MI,
2467                                                          AMDGPU::OpName::vdata);
2468       if (VData->isUndef()) {
2469         MI->eraseFromParent();
2470         return true;
2471       }
2472 
2473       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2474              MFI->getStackPtrOffsetReg());
2475 
2476       unsigned Opc;
2477       if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2478         assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2479         Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2480       } else {
2481         Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2482                   ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2483               : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2484                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2485       }
2486 
2487       auto *MBB = MI->getParent();
2488       bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2489       if (IsWWMRegSpill) {
2490         TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2491                                   RS->isRegUsed(AMDGPU::SCC));
2492       }
2493       buildSpillLoadStore(
2494           *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2495           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2496           *MI->memoperands_begin(), RS);
2497       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(*MI, TII));
2498       if (IsWWMRegSpill)
2499         TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2500 
2501       MI->eraseFromParent();
2502       return true;
2503     }
2504     case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2505       // Put mask into M0.
2506       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2507               AMDGPU::M0)
2508           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2509       LLVM_FALLTHROUGH;
2510     }
2511     case AMDGPU::SI_SPILL_V16_RESTORE:
2512     case AMDGPU::SI_SPILL_V32_RESTORE:
2513     case AMDGPU::SI_SPILL_V64_RESTORE:
2514     case AMDGPU::SI_SPILL_V96_RESTORE:
2515     case AMDGPU::SI_SPILL_V128_RESTORE:
2516     case AMDGPU::SI_SPILL_V160_RESTORE:
2517     case AMDGPU::SI_SPILL_V192_RESTORE:
2518     case AMDGPU::SI_SPILL_V224_RESTORE:
2519     case AMDGPU::SI_SPILL_V256_RESTORE:
2520     case AMDGPU::SI_SPILL_V288_RESTORE:
2521     case AMDGPU::SI_SPILL_V320_RESTORE:
2522     case AMDGPU::SI_SPILL_V352_RESTORE:
2523     case AMDGPU::SI_SPILL_V384_RESTORE:
2524     case AMDGPU::SI_SPILL_V512_RESTORE:
2525     case AMDGPU::SI_SPILL_V1024_RESTORE:
2526     case AMDGPU::SI_SPILL_A32_RESTORE:
2527     case AMDGPU::SI_SPILL_A64_RESTORE:
2528     case AMDGPU::SI_SPILL_A96_RESTORE:
2529     case AMDGPU::SI_SPILL_A128_RESTORE:
2530     case AMDGPU::SI_SPILL_A160_RESTORE:
2531     case AMDGPU::SI_SPILL_A192_RESTORE:
2532     case AMDGPU::SI_SPILL_A224_RESTORE:
2533     case AMDGPU::SI_SPILL_A256_RESTORE:
2534     case AMDGPU::SI_SPILL_A288_RESTORE:
2535     case AMDGPU::SI_SPILL_A320_RESTORE:
2536     case AMDGPU::SI_SPILL_A352_RESTORE:
2537     case AMDGPU::SI_SPILL_A384_RESTORE:
2538     case AMDGPU::SI_SPILL_A512_RESTORE:
2539     case AMDGPU::SI_SPILL_A1024_RESTORE:
2540     case AMDGPU::SI_SPILL_AV32_RESTORE:
2541     case AMDGPU::SI_SPILL_AV64_RESTORE:
2542     case AMDGPU::SI_SPILL_AV96_RESTORE:
2543     case AMDGPU::SI_SPILL_AV128_RESTORE:
2544     case AMDGPU::SI_SPILL_AV160_RESTORE:
2545     case AMDGPU::SI_SPILL_AV192_RESTORE:
2546     case AMDGPU::SI_SPILL_AV224_RESTORE:
2547     case AMDGPU::SI_SPILL_AV256_RESTORE:
2548     case AMDGPU::SI_SPILL_AV288_RESTORE:
2549     case AMDGPU::SI_SPILL_AV320_RESTORE:
2550     case AMDGPU::SI_SPILL_AV352_RESTORE:
2551     case AMDGPU::SI_SPILL_AV384_RESTORE:
2552     case AMDGPU::SI_SPILL_AV512_RESTORE:
2553     case AMDGPU::SI_SPILL_AV1024_RESTORE:
2554     case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2555     case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2556       const MachineOperand *VData = TII->getNamedOperand(*MI,
2557                                                          AMDGPU::OpName::vdata);
2558       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2559              MFI->getStackPtrOffsetReg());
2560 
2561       unsigned Opc;
2562       if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2563         assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2564         Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
2565       } else {
2566         Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2567                   ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2568               : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2569                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2570       }
2571 
2572       auto *MBB = MI->getParent();
2573       bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2574       if (IsWWMRegSpill) {
2575         TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2576                                    RS->isRegUsed(AMDGPU::SCC));
2577       }
2578 
2579       buildSpillLoadStore(
2580           *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2581           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2582           *MI->memoperands_begin(), RS);
2583 
2584       if (IsWWMRegSpill)
2585         TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2586 
2587       MI->eraseFromParent();
2588       return true;
2589     }
2590     case AMDGPU::V_ADD_U32_e32:
2591     case AMDGPU::V_ADD_U32_e64:
2592     case AMDGPU::V_ADD_CO_U32_e32:
2593     case AMDGPU::V_ADD_CO_U32_e64: {
2594       // TODO: Handle sub, and, or.
2595       unsigned NumDefs = MI->getNumExplicitDefs();
2596       unsigned Src0Idx = NumDefs;
2597 
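           // Work out, per add variant, whether a clamp modifier is present and where the
           // carry-out (VCC) operand lives.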
2598       bool HasClamp = false;
2599       MachineOperand *VCCOp = nullptr;
2600 
2601       switch (MI->getOpcode()) {
2602       case AMDGPU::V_ADD_U32_e32:
2603         break;
2604       case AMDGPU::V_ADD_U32_e64:
2605         HasClamp = MI->getOperand(3).getImm();
2606         break;
2607       case AMDGPU::V_ADD_CO_U32_e32:
2608         VCCOp = &MI->getOperand(3);
2609         break;
2610       case AMDGPU::V_ADD_CO_U32_e64:
2611         VCCOp = &MI->getOperand(1);
2612         HasClamp = MI->getOperand(4).getImm();
2613         break;
2614       default:
2615         break;
2616       }
2617       bool DeadVCC = !VCCOp || VCCOp->isDead();
2618       MachineOperand &DstOp = MI->getOperand(0);
2619       Register DstReg = DstOp.getReg();
2620 
2621       unsigned OtherOpIdx =
2622           FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2623       MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2624 
2625       unsigned Src1Idx = Src0Idx + 1;
2626       Register MaterializedReg = FrameReg;
2627       Register ScavengedVGPR;
2628 
2629       int64_t Offset = FrameInfo.getObjectOffset(Index);
2630       // For the non-immediate case, we could fall through to the default
2631       // handling, but we do an in-place update of the result register here to
2632       // avoid scavenging another register.
2633       if (OtherOp->isImm()) {
2634         int64_t TotalOffset = OtherOp->getImm() + Offset;
2635 
2636         if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2637             !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2638           // If we can't support a VOP3 literal in the VALU instruction, we
2639           // can't specially fold into the add.
2640           // TODO: Handle VOP3->VOP2 shrink to support the fold.
2641           break;
2642         }
2643 
2644         OtherOp->setImm(TotalOffset);
2645         Offset = 0;
2646       }
2647 
2648       if (FrameReg && !ST.enableFlatScratch()) {
2649         // We should just do an in-place update of the result register. However,
2650         // the value there may also be used by the add, in which case we need a
2651         // temporary register.
2652         //
2653         // FIXME: The scavenger is not finding the result register in the
2654         // common case where the add does not read the register.
2655 
2656         ScavengedVGPR = RS->scavengeRegisterBackwards(
2657             AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2658 
2659         // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2660         // shift.
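             // Materialize FrameReg >> log2(wavefront size) in a scavenged VGPR so the
             // wave-scaled frame address can be consumed as a per-lane offset by the
             // VALU add.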
2661         BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2662             .addDef(ScavengedVGPR, RegState::Renamable)
2663             .addImm(ST.getWavefrontSizeLog2())
2664             .addReg(FrameReg);
2665         MaterializedReg = ScavengedVGPR;
2666       }
2667 
2668       if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2669         if (ST.enableFlatScratch() &&
2670             !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2671           // We didn't need the shift above, so we have an SGPR for the frame
2672           // register, but may have a VGPR only operand.
2673           //
2674           // TODO: On gfx10+, we can easily change the opcode to the e64 version
2675           // and use the higher constant bus restriction to avoid this copy.
2676 
2677           if (!ScavengedVGPR) {
2678             ScavengedVGPR = RS->scavengeRegisterBackwards(
2679                 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2680                 /*SPAdj=*/0);
2681           }
2682 
2683           assert(ScavengedVGPR != DstReg);
2684 
2685           BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2686               .addReg(MaterializedReg,
2687                       MaterializedReg != FrameReg ? RegState::Kill : 0);
2688           MaterializedReg = ScavengedVGPR;
2689         }
2690 
2691         // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2692         // is not live, we could use a scalar add + vector add instead of 2
2693         // vector adds.
2694         auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2695                           .addDef(DstReg, RegState::Renamable);
2696         if (NumDefs == 2)
2697           AddI32.add(MI->getOperand(1));
2698 
2699         unsigned MaterializedRegFlags =
2700             MaterializedReg != FrameReg ? RegState::Kill : 0;
2701 
2702         if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2703           // If we know we have a VGPR already, it's more likely the other
2704           // operand is a legal vsrc0.
2705           AddI32
2706             .add(*OtherOp)
2707             .addReg(MaterializedReg, MaterializedRegFlags);
2708         } else {
2709           // Commute operands to avoid violating VOP2 restrictions. This will
2710           // typically happen when using scratch.
2711           AddI32
2712             .addReg(MaterializedReg, MaterializedRegFlags)
2713             .add(*OtherOp);
2714         }
2715 
2716         if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2717             MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2718           AddI32.addImm(0); // clamp
2719 
2720         if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2721           AddI32.setOperandDead(3); // Dead vcc
2722 
2723         MaterializedReg = DstReg;
2724 
2725         OtherOp->ChangeToRegister(MaterializedReg, false);
2726         OtherOp->setIsKill(true);
2727         FIOp->ChangeToImmediate(Offset);
2728         Offset = 0;
2729       } else if (Offset != 0) {
2730         assert(!MaterializedReg);
2731         FIOp->ChangeToImmediate(Offset);
2732         Offset = 0;
2733       } else {
2734         if (DeadVCC && !HasClamp) {
2735           assert(Offset == 0);
2736 
2737           // TODO: Losing kills and implicit operands. Just mutate to copy and
2738           // let lowerCopy deal with it?
2739           if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2740             // Folded to an identity copy.
2741             MI->eraseFromParent();
2742             return true;
2743           }
2744 
2745           // The immediate value should be in OtherOp
2746           MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2747           MI->removeOperand(FIOperandNum);
2748 
2749           unsigned NumOps = MI->getNumOperands();
2750           for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2751             MI->removeOperand(I);
2752 
2753           if (NumDefs == 2)
2754             MI->removeOperand(1);
2755 
2756           // The code below can't deal with a mov.
2757           return true;
2758         }
2759 
2760         // This folded to a constant, but we have to keep the add around for
2761         // pointless implicit defs or clamp modifier.
2762         FIOp->ChangeToImmediate(0);
2763       }
2764 
2765       // Try to improve legality by commuting.
2766       if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2767         std::swap(FIOp, OtherOp);
2768         std::swap(FIOperandNum, OtherOpIdx);
2769       }
2770 
2771       // We need at most one mov to satisfy the operand constraints. Prefer to
2772       // move the FI operand first, as it may be a literal in a VOP3
2773       // instruction.
2774       for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2775         if (!TII->isOperandLegal(*MI, SrcIdx)) {
2776           // If commuting didn't make the operands legal, we need to materialize
2777           // in a register.
2778           // TODO: Can use SGPR on gfx10+ in some cases.
2779           if (!ScavengedVGPR) {
2780             ScavengedVGPR = RS->scavengeRegisterBackwards(
2781                 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2782                 /*SPAdj=*/0);
2783           }
2784 
2785           assert(ScavengedVGPR != DstReg);
2786 
2787           MachineOperand &Src = MI->getOperand(SrcIdx);
2788           BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2789               .add(Src);
2790 
2791           Src.ChangeToRegister(ScavengedVGPR, false);
2792           Src.setIsKill(true);
2793           break;
2794         }
2795       }
2796 
2797       // Fold out add of 0 case that can appear in kernels.
2798       if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2799         if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2800           BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2801         }
2802 
2803         MI->eraseFromParent();
2804       }
2805 
2806       return true;
2807     }
2808     case AMDGPU::S_ADD_I32:
2809     case AMDGPU::S_ADD_U32: {
2810       // TODO: Handle s_or_b32, s_and_b32.
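           // Fold the frame index into the scalar add: rewrite the FI operand as either
           // the frame register (shifted down by log2(wavefront size) first when scratch
           // is accessed through MUBUF) or as an immediate offset.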
2811       unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2812       MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2813 
2814       assert(FrameReg || MFI->isBottomOfStack());
2815 
2816       MachineOperand &DstOp = MI->getOperand(0);
2817       const DebugLoc &DL = MI->getDebugLoc();
2818       Register MaterializedReg = FrameReg;
2819 
2820       // Defend against live scc, which should never happen in practice.
2821       bool DeadSCC = MI->getOperand(3).isDead();
2822 
2823       Register TmpReg;
2824 
2825       // FIXME: Scavenger should figure out that the result register is
2826       // available. Also should do this for the v_add case.
2827       if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2828         TmpReg = DstOp.getReg();
2829 
2830       if (FrameReg && !ST.enableFlatScratch()) {
2831         // FIXME: In the common case where the add does not also read its result
2832         // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2833         // available.
2834         if (!TmpReg)
2835           TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2836                                                  MI, /*RestoreAfter=*/false, 0,
2837                                                  /*AllowSpill=*/false);
2838         if (TmpReg) {
2839           BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2840               .addDef(TmpReg, RegState::Renamable)
2841               .addReg(FrameReg)
2842               .addImm(ST.getWavefrontSizeLog2())
2843               .setOperandDead(3); // Set SCC dead
2844         }
2845         MaterializedReg = TmpReg;
2846       }
2847 
2848       int64_t Offset = FrameInfo.getObjectOffset(Index);
2849 
2850       // For the non-immediate case, we could fall through to the default
2851       // handling, but we do an in-place update of the result register here to
2852       // avoid scavenging another register.
2853       if (OtherOp.isImm()) {
2854         OtherOp.setImm(OtherOp.getImm() + Offset);
2855         Offset = 0;
2856 
2857         if (MaterializedReg)
2858           FIOp->ChangeToRegister(MaterializedReg, false);
2859         else
2860           FIOp->ChangeToImmediate(0);
2861       } else if (MaterializedReg) {
2862         // If we can't fold the other operand, do another increment.
2863         Register DstReg = DstOp.getReg();
2864 
2865         if (!TmpReg && MaterializedReg == FrameReg) {
2866           TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2867                                                  MI, /*RestoreAfter=*/false, 0,
2868                                                  /*AllowSpill=*/false);
2869           DstReg = TmpReg;
2870         }
2871 
2872         if (TmpReg) {
2873           auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2874                             .addDef(DstReg, RegState::Renamable)
2875                             .addReg(MaterializedReg, RegState::Kill)
2876                             .add(OtherOp);
2877           if (DeadSCC)
2878             AddI32.setOperandDead(3);
2879 
2880           MaterializedReg = DstReg;
2881 
2882           OtherOp.ChangeToRegister(MaterializedReg, false);
2883           OtherOp.setIsKill(true);
2884           OtherOp.setIsRenamable(true);
2885         }
2886         FIOp->ChangeToImmediate(Offset);
2887       } else {
2888         // If we don't have any other offset to apply, we can just directly
2889         // interpret the frame index as the offset.
2890         FIOp->ChangeToImmediate(Offset);
2891       }
2892 
2893       if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2894         assert(Offset == 0);
2895         MI->removeOperand(3);
2896         MI->removeOperand(OtherOpIdx);
2897         MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2898       } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2899         assert(Offset == 0);
2900         MI->removeOperand(3);
2901         MI->removeOperand(FIOperandNum);
2902         MI->setDesc(
2903             TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2904       }
2905 
2906       assert(!FIOp->isFI());
2907       return true;
2908     }
2909     default: {
2910       break;
2911     }
2912     }
2913 
2914     int64_t Offset = FrameInfo.getObjectOffset(Index);
2915     if (ST.enableFlatScratch()) {
2916       if (TII->isFLATScratch(*MI)) {
2917         assert(
2918             (int16_t)FIOperandNum ==
2919             AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2920 
2921         // The offset is always swizzled; just replace it.
2922         if (FrameReg)
2923           FIOp->ChangeToRegister(FrameReg, false);
2924 
2925         MachineOperand *OffsetOp =
2926             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2927         int64_t NewOffset = Offset + OffsetOp->getImm();
2928         if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2929                                    SIInstrFlags::FlatScratch)) {
2930           OffsetOp->setImm(NewOffset);
2931           if (FrameReg)
2932             return false;
2933           Offset = 0;
2934         }
2935 
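             // With no offset left to add, try to drop the saddr operand entirely by
             // switching to the SV form (when the instruction has a vaddr) or, on targets
             // with ST mode, the ST form of the flat scratch instruction.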
2936         if (!Offset) {
2937           unsigned Opc = MI->getOpcode();
2938           int NewOpc = -1;
2939           if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2940             NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2941           } else if (ST.hasFlatScratchSTMode()) {
2942             // On GFX10 we have ST mode, which uses no registers for an address.
2943             // Otherwise we need to materialize 0 into an SGPR.
2944             NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2945           }
2946 
2947           if (NewOpc != -1) {
2948             // removeOperand doesn't fix up tied operand indexes as it goes, so
2949             // it asserts. Untie vdst_in for now and retie it afterwards.
2950             int VDstIn =
2951                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2952             bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2953                             MI->getOperand(VDstIn).isTied();
2954             if (TiedVDst)
2955               MI->untieRegOperand(VDstIn);
2956 
2957             MI->removeOperand(
2958                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2959 
2960             if (TiedVDst) {
2961               int NewVDst =
2962                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2963               int NewVDstIn =
2964                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2965               assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2966               MI->tieOperands(NewVDst, NewVDstIn);
2967             }
2968             MI->setDesc(TII->get(NewOpc));
2969             return false;
2970           }
2971         }
2972       }
2973 
2974       if (!FrameReg) {
2975         FIOp->ChangeToImmediate(Offset);
2976         if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2977           return false;
2978       }
2979 
2980       // We need to use a register here. Check whether we can use an SGPR or
2981       // need a VGPR.
2982       FIOp->ChangeToRegister(AMDGPU::M0, false);
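           // M0 is only a probe: checking operand legality with an SGPR plugged in tells
           // us whether an SGPR is acceptable here; the actual register is chosen below.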
2983       bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2984 
2985       if (!Offset && FrameReg && UseSGPR) {
2986         FIOp->setReg(FrameReg);
2987         return false;
2988       }
2989 
2990       const TargetRegisterClass *RC =
2991           UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2992 
2993       Register TmpReg =
2994           RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2995       FIOp->setReg(TmpReg);
2996       FIOp->setIsKill();
2997 
2998       if ((!FrameReg || !Offset) && TmpReg) {
2999         unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
3000         auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
3001         if (FrameReg)
3002           MIB.addReg(FrameReg);
3003         else
3004           MIB.addImm(Offset);
3005 
3006         return false;
3007       }
3008 
3009       bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3010                          !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3011 
3012       Register TmpSReg =
3013           UseSGPR ? TmpReg
3014                   : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3015                                                   MI, false, 0, !UseSGPR);
3016 
3017       // TODO: for flat scratch another attempt can be made with a VGPR index
3018       //       if no SGPRs can be scavenged.
3019       if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
3020         report_fatal_error("Cannot scavenge register in FI elimination!");
3021 
3022       if (!TmpSReg) {
3023         // Use frame register and restore it after.
3024         TmpSReg = FrameReg;
3025         FIOp->setReg(FrameReg);
3026         FIOp->setIsKill(false);
3027       }
3028 
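           // SCC is live and must survive the add. s_addc_u32 folds the incoming SCC
           // value into bit 0 of the sum (the offset is asserted even, so that bit is
           // free), s_bitcmp1_b32 reads it back into SCC, and s_bitset0_b32 clears the
           // bit to leave the correct result.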
3029       if (NeedSaveSCC) {
3030         assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3031         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3032             .addReg(FrameReg)
3033             .addImm(Offset);
3034         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3035             .addReg(TmpSReg)
3036             .addImm(0);
3037         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3038             .addImm(0)
3039             .addReg(TmpSReg);
3040       } else {
3041         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3042             .addReg(FrameReg)
3043             .addImm(Offset);
3044       }
3045 
3046       if (!UseSGPR)
3047         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3048             .addReg(TmpSReg, RegState::Kill);
3049 
3050       if (TmpSReg == FrameReg) {
3051         // Undo frame register modification.
3052         if (NeedSaveSCC &&
3053             !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3054           MachineBasicBlock::iterator I =
3055               BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3056                       TmpSReg)
3057                   .addReg(FrameReg)
3058                   .addImm(-Offset);
3059           I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3060                   .addReg(TmpSReg)
3061                   .addImm(0);
3062           BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3063                   TmpSReg)
3064               .addImm(0)
3065               .addReg(TmpSReg);
3066         } else {
3067           BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3068                   FrameReg)
3069               .addReg(FrameReg)
3070               .addImm(-Offset);
3071         }
3072       }
3073 
3074       return false;
3075     }
3076 
3077     bool IsMUBUF = TII->isMUBUF(*MI);
3078 
3079     if (!IsMUBUF && !MFI->isBottomOfStack()) {
3080       // Convert to a swizzled stack address by scaling by the wave size.
3081       // In an entry function/kernel the offset is already swizzled.
3082       bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
3083       bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3084                      !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3085       const TargetRegisterClass *RC = IsSALU && !LiveSCC
3086                                           ? &AMDGPU::SReg_32RegClass
3087                                           : &AMDGPU::VGPR_32RegClass;
3088       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3089                     MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3090                     MI->getOpcode() == AMDGPU::S_MOV_B32;
3091       Register ResultReg =
3092           IsCopy ? MI->getOperand(0).getReg()
3093                  : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3094 
3095       int64_t Offset = FrameInfo.getObjectOffset(Index);
3096       if (Offset == 0) {
3097         unsigned OpCode =
3098             IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3099         Register TmpResultReg = ResultReg;
3100         if (IsSALU && LiveSCC) {
3101           TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3102                                                        MI, false, 0);
3103         }
3104 
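             // When SCC is live, an SALU shift would clobber it, so do the shift in a
             // scavenged VGPR and copy the result back to an SGPR with v_readfirstlane
             // below.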
3105         auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3106         if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3107           // For V_LSHRREV, the operands are reversed (the shift count goes
3108           // first).
3109           Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3110         else
3111           Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3112         if (IsSALU && !LiveSCC)
3113           Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3114         if (IsSALU && LiveSCC) {
3115           Register NewDest;
3116           if (IsCopy) {
3117             MF->getRegInfo().constrainRegClass(ResultReg,
3118                                                &AMDGPU::SReg_32_XM0RegClass);
3119             NewDest = ResultReg;
3120           } else {
3121             NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3122                                                     Shift, false, 0);
3123           }
3124           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3125               .addReg(TmpResultReg);
3126           ResultReg = NewDest;
3127         }
3128       } else {
3129         MachineInstrBuilder MIB;
3130         if (!IsSALU) {
3131           if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3132               nullptr) {
3133             // Reuse ResultReg in intermediate step.
3134             Register ScaledReg = ResultReg;
3135 
3136             BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3137                     ScaledReg)
3138                 .addImm(ST.getWavefrontSizeLog2())
3139                 .addReg(FrameReg);
3140 
3141             const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3142 
3143             // TODO: Fold if use instruction is another add of a constant.
3144             if (IsVOP2 ||
3145                 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3146               // FIXME: This can fail
3147               MIB.addImm(Offset);
3148               MIB.addReg(ScaledReg, RegState::Kill);
3149               if (!IsVOP2)
3150                 MIB.addImm(0); // clamp bit
3151             } else {
3152               assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3153                      "Need to reuse carry out register");
3154 
3155               // Use scavenged unused carry out as offset register.
3156               Register ConstOffsetReg;
3157               if (!isWave32)
3158                 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3159               else
3160                 ConstOffsetReg = MIB.getReg(1);
3161 
3162               BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3163                       ConstOffsetReg)
3164                   .addImm(Offset);
3165               MIB.addReg(ConstOffsetReg, RegState::Kill);
3166               MIB.addReg(ScaledReg, RegState::Kill);
3167               MIB.addImm(0); // clamp bit
3168             }
3169           }
3170         }
3171         if (!MIB || IsSALU) {
3172           // We have to produce a carry out, and there isn't a free SGPR pair
3173           // for it. We can keep the whole computation on the SALU to avoid
3174           // clobbering an additional register at the cost of an extra mov.
3175 
3176           // We may have 1 free scratch SGPR even though a carry out is
3177           // unavailable. Only one additional mov is needed.
3178           Register TmpScaledReg = IsCopy && IsSALU
3179                                       ? ResultReg
3180                                       : RS->scavengeRegisterBackwards(
3181                                             AMDGPU::SReg_32_XM0RegClass, MI,
3182                                             false, 0, /*AllowSpill=*/false);
3183           Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3184           Register TmpResultReg = ScaledReg;
3185 
3186           if (!LiveSCC) {
3187             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3188                 .addReg(FrameReg)
3189                 .addImm(ST.getWavefrontSizeLog2());
3190             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3191                 .addReg(TmpResultReg, RegState::Kill)
3192                 .addImm(Offset);
3193           } else {
3194             TmpResultReg = RS->scavengeRegisterBackwards(
3195                 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3196 
3197             MachineInstrBuilder Add;
3198             if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3199               BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3200                       TmpResultReg)
3201                   .addImm(ST.getWavefrontSizeLog2())
3202                   .addReg(FrameReg);
3203               if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3204                 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3205                     .addImm(Offset);
3206                 Add.addReg(ResultReg, RegState::Kill)
3207                     .addReg(TmpResultReg, RegState::Kill)
3208                     .addImm(0);
3209               } else
3210                 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3211             } else {
3212               assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3213                      "offset is unsafe for v_mad_u32_u24");
3214 
3215               // We start with a frame pointer with a wave space value, and
3216               // an offset in lane-space. We are materializing a lane space
3217               // value. We can either do a right shift of the frame pointer
3218               // to get to lane space, or a left shift of the offset to get
3219               // to wavespace. We can right shift after the computation to
3220               // get back to the desired per-lane value. We are using the
3221               // mad_u32_u24 primarily as an add with no carry out clobber.
3222               bool IsInlinableLiteral =
3223                   AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3224               if (!IsInlinableLiteral) {
3225                 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3226                         TmpResultReg)
3227                     .addImm(Offset);
3228               }
3229 
3230               Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3231                             TmpResultReg);
3232 
3233               if (!IsInlinableLiteral) {
3234                 Add.addReg(TmpResultReg, RegState::Kill);
3235               } else {
3236                 // We fold the offset into the mad itself if it's inlinable.
3237                 Add.addImm(Offset);
3238               }
3239               Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3240               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3241                       TmpResultReg)
3242                   .addImm(ST.getWavefrontSizeLog2())
3243                   .addReg(TmpResultReg);
3244             }
3245 
3246             Register NewDest;
3247             if (IsCopy) {
3248               MF->getRegInfo().constrainRegClass(ResultReg,
3249                                                  &AMDGPU::SReg_32_XM0RegClass);
3250               NewDest = ResultReg;
3251             } else {
3252               NewDest = RS->scavengeRegisterBackwards(
3253                   AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3254                   /*AllowSpill=*/true);
3255             }
3256 
3257             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3258                     NewDest)
3259                 .addReg(TmpResultReg);
3260             ResultReg = NewDest;
3261           }
3262           if (!IsSALU)
3263             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3264                 .addReg(TmpResultReg, RegState::Kill);
3265           else
3266             ResultReg = TmpResultReg;
3267           // If there were truly no free SGPRs, we need to undo everything.
3268           if (!TmpScaledReg.isValid()) {
3269             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3270                 .addReg(ScaledReg, RegState::Kill)
3271                 .addImm(-Offset);
3272             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3273                 .addReg(FrameReg)
3274                 .addImm(ST.getWavefrontSizeLog2());
3275           }
3276         }
3277       }
3278 
3279       // Don't introduce an extra copy if we're just materializing in a mov.
3280       if (IsCopy) {
3281         MI->eraseFromParent();
3282         return true;
3283       }
3284       FIOp->ChangeToRegister(ResultReg, false, false, true);
3285       return false;
3286     }
3287 
3288     if (IsMUBUF) {
3289       // Disable offen so we don't need a 0 vgpr base.
3290       assert(
3291           static_cast<int>(FIOperandNum) ==
3292           AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3293 
3294       auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3295       assert((SOffset.isImm() && SOffset.getImm() == 0));
3296 
3297       if (FrameReg != AMDGPU::NoRegister)
3298         SOffset.ChangeToRegister(FrameReg, false);
3299 
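           // Try to fold the frame object's offset into the MUBUF immediate offset; if
           // the combined value is encodable, rewrite the access to the offset-only form
           // and drop the original instruction.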
3300       int64_t Offset = FrameInfo.getObjectOffset(Index);
3301       int64_t OldImm =
3302           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3303       int64_t NewOffset = OldImm + Offset;
3304 
3305       if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3306           buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3307         MI->eraseFromParent();
3308         return true;
3309       }
3310     }
3311 
3312     // If the offset is simply too big, don't convert to a scratch wave offset
3313     // relative index.
3314 
3315     FIOp->ChangeToImmediate(Offset);
3316     if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3317       Register TmpReg =
3318           RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3319       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3320           .addImm(Offset);
3321       FIOp->ChangeToRegister(TmpReg, false, false, true);
3322     }
3323 
3324   return false;
3325 }
3326 
3327 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
3328   return AMDGPUInstPrinter::getRegisterName(Reg);
3329 }
3330 
3331 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
3332   return getRegBitWidth(RC.getID());
3333 }
3334 
3335 static const TargetRegisterClass *
3336 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3337   if (BitWidth == 64)
3338     return &AMDGPU::VReg_64RegClass;
3339   if (BitWidth == 96)
3340     return &AMDGPU::VReg_96RegClass;
3341   if (BitWidth == 128)
3342     return &AMDGPU::VReg_128RegClass;
3343   if (BitWidth == 160)
3344     return &AMDGPU::VReg_160RegClass;
3345   if (BitWidth == 192)
3346     return &AMDGPU::VReg_192RegClass;
3347   if (BitWidth == 224)
3348     return &AMDGPU::VReg_224RegClass;
3349   if (BitWidth == 256)
3350     return &AMDGPU::VReg_256RegClass;
3351   if (BitWidth == 288)
3352     return &AMDGPU::VReg_288RegClass;
3353   if (BitWidth == 320)
3354     return &AMDGPU::VReg_320RegClass;
3355   if (BitWidth == 352)
3356     return &AMDGPU::VReg_352RegClass;
3357   if (BitWidth == 384)
3358     return &AMDGPU::VReg_384RegClass;
3359   if (BitWidth == 512)
3360     return &AMDGPU::VReg_512RegClass;
3361   if (BitWidth == 1024)
3362     return &AMDGPU::VReg_1024RegClass;
3363 
3364   return nullptr;
3365 }
3366 
3367 static const TargetRegisterClass *
3368 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
3369   if (BitWidth == 64)
3370     return &AMDGPU::VReg_64_Align2RegClass;
3371   if (BitWidth == 96)
3372     return &AMDGPU::VReg_96_Align2RegClass;
3373   if (BitWidth == 128)
3374     return &AMDGPU::VReg_128_Align2RegClass;
3375   if (BitWidth == 160)
3376     return &AMDGPU::VReg_160_Align2RegClass;
3377   if (BitWidth == 192)
3378     return &AMDGPU::VReg_192_Align2RegClass;
3379   if (BitWidth == 224)
3380     return &AMDGPU::VReg_224_Align2RegClass;
3381   if (BitWidth == 256)
3382     return &AMDGPU::VReg_256_Align2RegClass;
3383   if (BitWidth == 288)
3384     return &AMDGPU::VReg_288_Align2RegClass;
3385   if (BitWidth == 320)
3386     return &AMDGPU::VReg_320_Align2RegClass;
3387   if (BitWidth == 352)
3388     return &AMDGPU::VReg_352_Align2RegClass;
3389   if (BitWidth == 384)
3390     return &AMDGPU::VReg_384_Align2RegClass;
3391   if (BitWidth == 512)
3392     return &AMDGPU::VReg_512_Align2RegClass;
3393   if (BitWidth == 1024)
3394     return &AMDGPU::VReg_1024_Align2RegClass;
3395 
3396   return nullptr;
3397 }
3398 
3399 const TargetRegisterClass *
3400 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3401   if (BitWidth == 1)
3402     return &AMDGPU::VReg_1RegClass;
3403   if (BitWidth == 16)
3404     return &AMDGPU::VGPR_16RegClass;
3405   if (BitWidth == 32)
3406     return &AMDGPU::VGPR_32RegClass;
3407   return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3408                                 : getAnyVGPRClassForBitWidth(BitWidth);
3409 }
3410 
3411 static const TargetRegisterClass *
3412 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3413   if (BitWidth == 64)
3414     return &AMDGPU::AReg_64RegClass;
3415   if (BitWidth == 96)
3416     return &AMDGPU::AReg_96RegClass;
3417   if (BitWidth == 128)
3418     return &AMDGPU::AReg_128RegClass;
3419   if (BitWidth == 160)
3420     return &AMDGPU::AReg_160RegClass;
3421   if (BitWidth == 192)
3422     return &AMDGPU::AReg_192RegClass;
3423   if (BitWidth == 224)
3424     return &AMDGPU::AReg_224RegClass;
3425   if (BitWidth == 256)
3426     return &AMDGPU::AReg_256RegClass;
3427   if (BitWidth == 288)
3428     return &AMDGPU::AReg_288RegClass;
3429   if (BitWidth == 320)
3430     return &AMDGPU::AReg_320RegClass;
3431   if (BitWidth == 352)
3432     return &AMDGPU::AReg_352RegClass;
3433   if (BitWidth == 384)
3434     return &AMDGPU::AReg_384RegClass;
3435   if (BitWidth == 512)
3436     return &AMDGPU::AReg_512RegClass;
3437   if (BitWidth == 1024)
3438     return &AMDGPU::AReg_1024RegClass;
3439 
3440   return nullptr;
3441 }
3442 
3443 static const TargetRegisterClass *
3444 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3445   if (BitWidth == 64)
3446     return &AMDGPU::AReg_64_Align2RegClass;
3447   if (BitWidth == 96)
3448     return &AMDGPU::AReg_96_Align2RegClass;
3449   if (BitWidth == 128)
3450     return &AMDGPU::AReg_128_Align2RegClass;
3451   if (BitWidth == 160)
3452     return &AMDGPU::AReg_160_Align2RegClass;
3453   if (BitWidth == 192)
3454     return &AMDGPU::AReg_192_Align2RegClass;
3455   if (BitWidth == 224)
3456     return &AMDGPU::AReg_224_Align2RegClass;
3457   if (BitWidth == 256)
3458     return &AMDGPU::AReg_256_Align2RegClass;
3459   if (BitWidth == 288)
3460     return &AMDGPU::AReg_288_Align2RegClass;
3461   if (BitWidth == 320)
3462     return &AMDGPU::AReg_320_Align2RegClass;
3463   if (BitWidth == 352)
3464     return &AMDGPU::AReg_352_Align2RegClass;
3465   if (BitWidth == 384)
3466     return &AMDGPU::AReg_384_Align2RegClass;
3467   if (BitWidth == 512)
3468     return &AMDGPU::AReg_512_Align2RegClass;
3469   if (BitWidth == 1024)
3470     return &AMDGPU::AReg_1024_Align2RegClass;
3471 
3472   return nullptr;
3473 }
3474 
3475 const TargetRegisterClass *
3476 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3477   if (BitWidth == 16)
3478     return &AMDGPU::AGPR_LO16RegClass;
3479   if (BitWidth == 32)
3480     return &AMDGPU::AGPR_32RegClass;
3481   return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3482                                 : getAnyAGPRClassForBitWidth(BitWidth);
3483 }
3484 
3485 static const TargetRegisterClass *
3486 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3487   if (BitWidth == 64)
3488     return &AMDGPU::AV_64RegClass;
3489   if (BitWidth == 96)
3490     return &AMDGPU::AV_96RegClass;
3491   if (BitWidth == 128)
3492     return &AMDGPU::AV_128RegClass;
3493   if (BitWidth == 160)
3494     return &AMDGPU::AV_160RegClass;
3495   if (BitWidth == 192)
3496     return &AMDGPU::AV_192RegClass;
3497   if (BitWidth == 224)
3498     return &AMDGPU::AV_224RegClass;
3499   if (BitWidth == 256)
3500     return &AMDGPU::AV_256RegClass;
3501   if (BitWidth == 288)
3502     return &AMDGPU::AV_288RegClass;
3503   if (BitWidth == 320)
3504     return &AMDGPU::AV_320RegClass;
3505   if (BitWidth == 352)
3506     return &AMDGPU::AV_352RegClass;
3507   if (BitWidth == 384)
3508     return &AMDGPU::AV_384RegClass;
3509   if (BitWidth == 512)
3510     return &AMDGPU::AV_512RegClass;
3511   if (BitWidth == 1024)
3512     return &AMDGPU::AV_1024RegClass;
3513 
3514   return nullptr;
3515 }
3516 
3517 static const TargetRegisterClass *
3518 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3519   if (BitWidth == 64)
3520     return &AMDGPU::AV_64_Align2RegClass;
3521   if (BitWidth == 96)
3522     return &AMDGPU::AV_96_Align2RegClass;
3523   if (BitWidth == 128)
3524     return &AMDGPU::AV_128_Align2RegClass;
3525   if (BitWidth == 160)
3526     return &AMDGPU::AV_160_Align2RegClass;
3527   if (BitWidth == 192)
3528     return &AMDGPU::AV_192_Align2RegClass;
3529   if (BitWidth == 224)
3530     return &AMDGPU::AV_224_Align2RegClass;
3531   if (BitWidth == 256)
3532     return &AMDGPU::AV_256_Align2RegClass;
3533   if (BitWidth == 288)
3534     return &AMDGPU::AV_288_Align2RegClass;
3535   if (BitWidth == 320)
3536     return &AMDGPU::AV_320_Align2RegClass;
3537   if (BitWidth == 352)
3538     return &AMDGPU::AV_352_Align2RegClass;
3539   if (BitWidth == 384)
3540     return &AMDGPU::AV_384_Align2RegClass;
3541   if (BitWidth == 512)
3542     return &AMDGPU::AV_512_Align2RegClass;
3543   if (BitWidth == 1024)
3544     return &AMDGPU::AV_1024_Align2RegClass;
3545 
3546   return nullptr;
3547 }
3548 
3549 const TargetRegisterClass *
3550 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3551   if (BitWidth == 32)
3552     return &AMDGPU::AV_32RegClass;
3553   return ST.needsAlignedVGPRs()
3554              ? getAlignedVectorSuperClassForBitWidth(BitWidth)
3555              : getAnyVectorSuperClassForBitWidth(BitWidth);
3556 }
3557 
3558 const TargetRegisterClass *
3559 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3560   if (BitWidth == 16 || BitWidth == 32)
3561     return &AMDGPU::SReg_32RegClass;
3562   if (BitWidth == 64)
3563     return &AMDGPU::SReg_64RegClass;
3564   if (BitWidth == 96)
3565     return &AMDGPU::SGPR_96RegClass;
3566   if (BitWidth == 128)
3567     return &AMDGPU::SGPR_128RegClass;
3568   if (BitWidth == 160)
3569     return &AMDGPU::SGPR_160RegClass;
3570   if (BitWidth == 192)
3571     return &AMDGPU::SGPR_192RegClass;
3572   if (BitWidth == 224)
3573     return &AMDGPU::SGPR_224RegClass;
3574   if (BitWidth == 256)
3575     return &AMDGPU::SGPR_256RegClass;
3576   if (BitWidth == 288)
3577     return &AMDGPU::SGPR_288RegClass;
3578   if (BitWidth == 320)
3579     return &AMDGPU::SGPR_320RegClass;
3580   if (BitWidth == 352)
3581     return &AMDGPU::SGPR_352RegClass;
3582   if (BitWidth == 384)
3583     return &AMDGPU::SGPR_384RegClass;
3584   if (BitWidth == 512)
3585     return &AMDGPU::SGPR_512RegClass;
3586   if (BitWidth == 1024)
3587     return &AMDGPU::SGPR_1024RegClass;
3588 
3589   return nullptr;
3590 }
3591 
3592 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3593                                Register Reg) const {
3594   const TargetRegisterClass *RC;
3595   if (Reg.isVirtual())
3596     RC = MRI.getRegClass(Reg);
3597   else
3598     RC = getPhysRegBaseClass(Reg);
3599   return RC && isSGPRClass(RC);
3600 }
3601 
3602 const TargetRegisterClass *
3603 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3604   unsigned Size = getRegSizeInBits(*SRC);
3605   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
3606   assert(VRC && "Invalid register class size");
3607   return VRC;
3608 }
3609 
3610 const TargetRegisterClass *
3611 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3612   unsigned Size = getRegSizeInBits(*SRC);
3613   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3614   assert(ARC && "Invalid register class size");
3615   return ARC;
3616 }
3617 
3618 const TargetRegisterClass *
3619 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3620   unsigned Size = getRegSizeInBits(*VRC);
3621   if (Size == 32)
3622     return &AMDGPU::SGPR_32RegClass;
3623   const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3624   assert(SRC && "Invalid register class size");
3625   return SRC;
3626 }
3627 
3628 const TargetRegisterClass *
3629 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3630                                          const TargetRegisterClass *SubRC,
3631                                          unsigned SubIdx) const {
3632   // Ensure this subregister index is aligned in the super register.
3633   const TargetRegisterClass *MatchRC =
3634       getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3635   return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3636 }
3637 
3638 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3639   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3640       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3641     return !ST.hasMFMAInlineLiteralBug();
3642 
3643   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3644          OpType <= AMDGPU::OPERAND_SRC_LAST;
3645 }
3646 
3647 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3648   // TODO: 64-bit operands have extending behavior from 32-bit literal.
3649   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3650          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3651 }
3652 
3653 /// Returns the lowest register that is not used at any point in the function.
3654 /// If all registers are used, this function returns AMDGPU::NoRegister. If
3655 /// \p ReserveHighestRegister = true, the highest unused register is returned
3656 /// instead.
3657 MCRegister SIRegisterInfo::findUnusedRegister(
3658     const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3659     const MachineFunction &MF, bool ReserveHighestRegister) const {
3660   if (ReserveHighestRegister) {
3661     for (MCRegister Reg : reverse(*RC))
3662       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3663         return Reg;
3664   } else {
3665     for (MCRegister Reg : *RC)
3666       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3667         return Reg;
3668   }
3669   return MCRegister();
3670 }
3671 
3672 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3673                                   const RegisterBankInfo &RBI,
3674                                   Register Reg) const {
3675   auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3676   if (!RB)
3677     return false;
3678 
3679   return !RBI.isDivergentRegBank(RB);
3680 }
3681 
3682 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3683                                                    unsigned EltSize) const {
3684   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3685   assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3686 
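       // RegSplitParts is indexed by the element width in 16-bit halves (minus one);
       // each entry lists the sub-register indices for pieces of that width, and this
       // register needs the first RegHalves / EltHalves of them.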
3687   const unsigned RegHalves = RegBitWidth / 16;
3688   const unsigned EltHalves = EltSize / 2;
3689   assert(RegSplitParts.size() + 1 >= EltHalves);
3690 
3691   const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3692   const unsigned NumParts = RegHalves / EltHalves;
3693 
3694   return ArrayRef(Parts.data(), NumParts);
3695 }
3696 
3697 const TargetRegisterClass*
3698 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3699                                   Register Reg) const {
3700   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3701 }
3702 
3703 const TargetRegisterClass *
3704 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3705                                          const MachineOperand &MO) const {
3706   const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3707   return getSubRegisterClass(SrcRC, MO.getSubReg());
3708 }
3709 
3710 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3711                             Register Reg) const {
3712   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3713   // Registers without classes are unaddressable, SGPR-like registers.
3714   return RC && isVGPRClass(RC);
3715 }
3716 
3717 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3718                             Register Reg) const {
3719   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3720 
3721   // Registers without classes are unaddressable, SGPR-like registers.
3722   return RC && isAGPRClass(RC);
3723 }
3724 
3725 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3726                                     const TargetRegisterClass *SrcRC,
3727                                     unsigned SubReg,
3728                                     const TargetRegisterClass *DstRC,
3729                                     unsigned DstSubReg,
3730                                     const TargetRegisterClass *NewRC,
3731                                     LiveIntervals &LIS) const {
3732   unsigned SrcSize = getRegSizeInBits(*SrcRC);
3733   unsigned DstSize = getRegSizeInBits(*DstRC);
3734   unsigned NewSize = getRegSizeInBits(*NewRC);
3735 
3736   // Do not increase the size of registers beyond a dword; we would need to
3737   // allocate adjacent registers and constrain regalloc more than needed.
3738 
3739   // Always allow dword coalescing.
3740   if (SrcSize <= 32 || DstSize <= 32)
3741     return true;
3742 
3743   return NewSize <= DstSize || NewSize <= SrcSize;
3744 }
3745 
3746 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3747                                              MachineFunction &MF) const {
3748   unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3749   switch (RC->getID()) {
3750   default:
3751     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3752   case AMDGPU::VGPR_32RegClassID:
3753     return std::min(
3754         ST.getMaxNumVGPRs(
3755             MinOcc,
3756             MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
3757         ST.getMaxNumVGPRs(MF));
3758   case AMDGPU::SGPR_32RegClassID:
3759   case AMDGPU::SGPR_LO16RegClassID:
3760     return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3761   }
3762 }
3763 
3764 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3765                                                 unsigned Idx) const {
3766   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3767       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3768     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3769                                const_cast<MachineFunction &>(MF));
3770 
3771   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3772     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3773                                const_cast<MachineFunction &>(MF));
3774 
3775   llvm_unreachable("Unexpected register pressure set!");
3776 }
3777 
3778 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3779   static const int Empty[] = { -1 };
3780 
3781   if (RegPressureIgnoredUnits[RegUnit])
3782     return Empty;
3783 
3784   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3785 }
3786 
3787 bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
3788                                            ArrayRef<MCPhysReg> Order,
3789                                            SmallVectorImpl<MCPhysReg> &Hints,
3790                                            const MachineFunction &MF,
3791                                            const VirtRegMap *VRM,
3792                                            const LiveRegMatrix *Matrix) const {
3793 
3794   const MachineRegisterInfo &MRI = MF.getRegInfo();
3795   const SIRegisterInfo *TRI = ST.getRegisterInfo();
3796 
3797   std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3798 
3799   switch (Hint.first) {
3800   case AMDGPURI::Size32: {
3801     Register Paired = Hint.second;
3802     assert(Paired);
3803     Register PairedPhys;
3804     if (Paired.isPhysical()) {
3805       PairedPhys =
3806           getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3807     } else if (VRM && VRM->hasPhys(Paired)) {
3808       PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3809                                        &AMDGPU::VGPR_32RegClass);
3810     }
3811 
3812     // Prefer the paired physreg.
3813     if (PairedPhys)
3814       // isLo(Paired) is implicitly true here from the API of
3815       // getMatchingSuperReg.
3816       Hints.push_back(PairedPhys);
3817     return false;
3818   }
3819   case AMDGPURI::Size16: {
3820     Register Paired = Hint.second;
3821     assert(Paired);
3822     Register PairedPhys;
3823     if (Paired.isPhysical()) {
3824       PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3825     } else if (VRM && VRM->hasPhys(Paired)) {
3826       PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3827     }
3828 
3829     // First prefer the paired physreg.
3830     if (PairedPhys)
3831       Hints.push_back(PairedPhys);
3832     else {
3833       // Add all the lo16 physregs.
3834       // When the Paired operand has not yet been assigned a physreg, it is
3835       // better to try putting VirtReg in a lo16 register, since Paired may later
3836       // be assigned to the overlapping register and the COPY can then be
3837       // eliminated.
3838       for (MCPhysReg PhysReg : Order) {
3839         if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3840           continue;
3841         if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3842             !MRI.isReserved(PhysReg))
3843           Hints.push_back(PhysReg);
3844       }
3845     }
3846     return false;
3847   }
3848   default:
3849     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3850                                                      VRM);
3851   }
3852 }
3853 
3854 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3855   // Not a callee saved register.
3856   return AMDGPU::SGPR30_SGPR31;
3857 }
3858 
3859 const TargetRegisterClass *
3860 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3861                                          const RegisterBank &RB) const {
3862   switch (RB.getID()) {
3863   case AMDGPU::VGPRRegBankID:
3864     return getVGPRClassForBitWidth(
3865         std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3866   case AMDGPU::VCCRegBankID:
3867     assert(Size == 1);
3868     return getWaveMaskRegClass();
3869   case AMDGPU::SGPRRegBankID:
3870     return getSGPRClassForBitWidth(std::max(32u, Size));
3871   case AMDGPU::AGPRRegBankID:
3872     return getAGPRClassForBitWidth(std::max(32u, Size));
3873   default:
3874     llvm_unreachable("unknown register bank");
3875   }
3876 }
3877 
3878 const TargetRegisterClass *
3879 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3880                                          const MachineRegisterInfo &MRI) const {
3881   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3882   if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3883     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3884 
3885   if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3886     return getAllocatableClass(RC);
3887 
3888   return nullptr;
3889 }
3890 
3891 MCRegister SIRegisterInfo::getVCC() const {
3892   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3893 }
3894 
3895 MCRegister SIRegisterInfo::getExec() const {
3896   return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3897 }
3898 
3899 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3900   // VGPR tuples have an alignment requirement on gfx90a variants.
3901   return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3902                                 : &AMDGPU::VReg_64RegClass;
3903 }
3904 
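// Resolve pseudo register class IDs that depend on the wave size: SReg_1 maps
// to the boolean class and SReg_1_XEXEC to the wave-mask class. An ID of -1
// means "no class".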
3905 const TargetRegisterClass *
3906 SIRegisterInfo::getRegClass(unsigned RCID) const {
3907   switch ((int)RCID) {
3908   case AMDGPU::SReg_1RegClassID:
3909     return getBoolRC();
3910   case AMDGPU::SReg_1_XEXECRegClassID:
3911     return getWaveMaskRegClass();
3912   case -1:
3913     return nullptr;
3914   default:
3915     return AMDGPUGenRegisterInfo::getRegClass(RCID);
3916   }
3917 }
3918 
3919 // Find the definition of Reg that reaches Use (nullptr if it cannot be found).
3920 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3921                                               MachineInstr &Use,
3922                                               MachineRegisterInfo &MRI,
3923                                               LiveIntervals *LIS) const {
3924   auto &MDT = LIS->getDomTree();
3925   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3926   SlotIndex DefIdx;
3927 
3928   if (Reg.isVirtual()) {
3929     if (!LIS->hasInterval(Reg))
3930       return nullptr;
3931     LiveInterval &LI = LIS->getInterval(Reg);
3932     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3933                                   : MRI.getMaxLaneMaskForVReg(Reg);
3934     VNInfo *V = nullptr;
3935     if (LI.hasSubRanges()) {
3936       for (auto &S : LI.subranges()) {
3937         if ((S.LaneMask & SubLanes) == SubLanes) {
3938           V = S.getVNInfoAt(UseIdx);
3939           break;
3940         }
3941       }
3942     } else {
3943       V = LI.getVNInfoAt(UseIdx);
3944     }
3945     if (!V)
3946       return nullptr;
3947     DefIdx = V->def;
3948   } else {
3949     // Find last def.
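    // A physical register has no single live interval, so walk each of its
    // register units and keep the latest reaching def (the one dominated by
    // the defs seen so far). If any unit has no value at the use, bail out.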
3950     for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3951       LiveRange &LR = LIS->getRegUnit(Unit);
3952       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3953         if (!DefIdx.isValid() ||
3954             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3955                           LIS->getInstructionFromIndex(V->def)))
3956           DefIdx = V->def;
3957       } else {
3958         return nullptr;
3959       }
3960     }
3961   }
3962 
3963   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3964 
3965   if (!Def || !MDT.dominates(Def, &Use))
3966     return nullptr;
3967 
3968   assert(Def->modifiesRegister(Reg, this));
3969 
3970   return Def;
3971 }
3972 
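// Return the 32-bit register whose lo16 or hi16 half is Reg: first look for a
// lo16 match in the VGPR, SGPR and AGPR classes, then for a hi16 match in
// VGPRs. Returns NoRegister if there is no such super-register.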
3973 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3974   assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3975 
3976   for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3977                                          AMDGPU::SReg_32RegClass,
3978                                          AMDGPU::AGPR_32RegClass } ) {
3979     if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3980       return Super;
3981   }
3982   if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3983                                             &AMDGPU::VGPR_32RegClass)) {
3984     return Super;
3985   }
3986 
3987   return AMDGPU::NoRegister;
3988 }
3989 
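// On subtargets that require aligned VGPRs (e.g. gfx90a), multi-register VGPR,
// AGPR and AV tuples must start at an even register. Check whether RC already
// guarantees that alignment.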
3990 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3991   if (!ST.needsAlignedVGPRs())
3992     return true;
3993 
3994   if (isVGPRClass(&RC))
3995     return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3996   if (isAGPRClass(&RC))
3997     return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3998   if (isVectorSuperClass(&RC))
3999     return RC.hasSuperClassEq(
4000         getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4001 
4002   return true;
4003 }
4004 
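// Return RC itself when no alignment is required or the class is 32 bits or
// smaller; otherwise return the equivalent even-aligned class.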
4005 const TargetRegisterClass *
4006 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
4007   if (!RC || !ST.needsAlignedVGPRs())
4008     return RC;
4009 
4010   unsigned Size = getRegSizeInBits(*RC);
4011   if (Size <= 32)
4012     return RC;
4013 
4014   if (isVGPRClass(RC))
4015     return getAlignedVGPRClassForBitWidth(Size);
4016   if (isAGPRClass(RC))
4017     return getAlignedAGPRClassForBitWidth(Size);
4018   if (isVectorSuperClass(RC))
4019     return getAlignedVectorSuperClassForBitWidth(Size);
4020 
4021   return RC;
4022 }
4023 
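// The getAllSGPR* helpers below return the SGPRs (as 128-bit quads, 64-bit
// pairs or single registers) that fit within this function's SGPR budget,
// i.e. the first getMaxNumSGPRs(MF) registers of each class.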
4024 ArrayRef<MCPhysReg>
4025 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
4026   return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4027 }
4028 
4029 ArrayRef<MCPhysReg>
4030 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
4031   return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4032 }
4033 
4034 ArrayRef<MCPhysReg>
4035 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
4036   return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4037 }
4038 
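// The alignment, in bits, a sub-register of RC can be assumed to have: SGPR
// tuples are allocated with up to 128-bit alignment and VGPR/AGPR tuples with
// 32-bit alignment, so the result is the sub-register size capped at that
// limit (0 for unknown register kinds).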
4039 unsigned
4040 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
4041                                           unsigned SubReg) const {
4042   switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4043   case SIRCFlags::HasSGPR:
4044     return std::min(128u, getSubRegIdxSize(SubReg));
4045   case SIRCFlags::HasAGPR:
4046   case SIRCFlags::HasVGPR:
4047   case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
4048     return std::min(32u, getSubRegIdxSize(SubReg));
4049   default:
4050     break;
4051   }
4052   return 0;
4053 }
4054 
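// Count how many registers of RC this function occupies: scan from the
// highest register downward and return the hardware index of the first used
// one plus 1, or 0 if none are used. This assumes the class lists its
// registers in ascending hardware order.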
4055 unsigned
4056 SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
4057                                    const TargetRegisterClass &RC) const {
4058   for (MCPhysReg Reg : reverse(RC.getRegisters()))
4059     if (MRI.isPhysRegUsed(Reg))
4060       return getHWRegIndex(Reg) + 1;
4061   return 0;
4062 }
4063 
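// Report the flags recorded for virtual register Reg as strings; currently
// only the WWM_REG (whole-wave-mode register) flag is tracked.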
4064 SmallVector<StringLiteral>
4065 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
4066                                   const MachineFunction &MF) const {
4067   SmallVector<StringLiteral> RegFlags;
4068   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4069   if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4070     RegFlags.push_back("WWM_REG");
4071   return RegFlags;
4072 }
4073