xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp (revision 9729f076e4d93c5a37e78d427bfe0f1ab99bbcc6)
1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "GCNSubtarget.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/LiveIntervals.h"
21 #include "llvm/CodeGen/LiveRegMatrix.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/InitializePasses.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "amdgpu-nsa-reassign"
28 
29 STATISTIC(NumNSAInstructions,
30           "Number of NSA instructions with non-sequential address found");
31 STATISTIC(NumNSAConverted,
32           "Number of NSA instructions changed to sequential");
33 
34 namespace {
35 
36 class GCNNSAReassign : public MachineFunctionPass {
37 public:
38   static char ID;
39 
40   GCNNSAReassign() : MachineFunctionPass(ID) {
41     initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
42   }
43 
44   bool runOnMachineFunction(MachineFunction &MF) override;
45 
46   StringRef getPassName() const override { return "GCN NSA Reassign"; }
47 
48   void getAnalysisUsage(AnalysisUsage &AU) const override {
49     AU.addRequired<LiveIntervals>();
50     AU.addRequired<VirtRegMap>();
51     AU.addRequired<LiveRegMatrix>();
52     AU.setPreservesAll();
53     MachineFunctionPass::getAnalysisUsage(AU);
54   }
55 
56 private:
57   typedef enum {
58     NOT_NSA,        // Not an NSA instruction
59     FIXED,          // NSA which we cannot modify
60     NON_CONTIGUOUS, // NSA with non-sequential address which we can try
61                     // to optimize.
62     CONTIGUOUS      // NSA with all sequential address registers
63   } NSA_Status;
64 
65   const GCNSubtarget *ST;
66 
67   const MachineRegisterInfo *MRI;
68 
69   const SIRegisterInfo *TRI;
70 
71   VirtRegMap *VRM;
72 
73   LiveRegMatrix *LRM;
74 
75   LiveIntervals *LIS;
76 
77   unsigned MaxNumVGPRs;
78 
79   const MCPhysReg *CSRegs;
80 
81   NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
82 
83   bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
84                           unsigned StartReg) const;
85 
86   bool canAssign(unsigned StartReg, unsigned NumRegs) const;
87 
88   bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
89 };
90 
91 } // End anonymous namespace.
92 
93 INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
94                       false, false)
95 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
96 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
97 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
98 INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
99                     false, false)
100 
101 
102 char GCNNSAReassign::ID = 0;
103 
104 char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
105 
106 bool
107 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
108                                    unsigned StartReg) const {
109   unsigned NumRegs = Intervals.size();
110 
111   for (unsigned N = 0; N < NumRegs; ++N)
112     if (VRM->hasPhys(Intervals[N]->reg()))
113       LRM->unassign(*Intervals[N]);
114 
115   for (unsigned N = 0; N < NumRegs; ++N)
116     if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
117       return false;
118 
119   for (unsigned N = 0; N < NumRegs; ++N)
120     LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
121 
122   return true;
123 }
124 
125 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
126   for (unsigned N = 0; N < NumRegs; ++N) {
127     unsigned Reg = StartReg + N;
128     if (!MRI->isAllocatable(Reg))
129       return false;
130 
131     for (unsigned I = 0; CSRegs[I]; ++I)
132       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
133           !LRM->isPhysRegUsed(CSRegs[I]))
134       return false;
135   }
136 
137   return true;
138 }
139 
140 bool
141 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
142   unsigned NumRegs = Intervals.size();
143 
144   if (NumRegs > MaxNumVGPRs)
145     return false;
146   unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
147 
148   for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
149     if (!canAssign(Reg, NumRegs))
150       continue;
151 
152     if (tryAssignRegisters(Intervals, Reg))
153       return true;
154   }
155 
156   return false;
157 }
158 
159 GCNNSAReassign::NSA_Status
160 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
161   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
162   if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
163     return NSA_Status::NOT_NSA;
164 
165   int VAddr0Idx =
166     AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
167 
168   unsigned VgprBase = 0;
169   bool NSA = false;
170   for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
171     const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
172     Register Reg = Op.getReg();
173     if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
174       return NSA_Status::FIXED;
175 
176     Register PhysReg = VRM->getPhys(Reg);
177 
178     if (!Fast) {
179       if (!PhysReg)
180         return NSA_Status::FIXED;
181 
182       // Bail if address is not a VGPR32. That should be possible to extend the
183       // optimization to work with subregs of a wider register tuples, but the
184       // logic to find free registers will be much more complicated with much
185       // less chances for success. That seems reasonable to assume that in most
186       // cases a tuple is used because a vector variable contains different
187       // parts of an address and it is either already consequitive or cannot
188       // be reassigned if not. If needed it is better to rely on register
189       // coalescer to process such address tuples.
190       if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
191         return NSA_Status::FIXED;
192 
193       // InlineSpiller does not call LRM::assign() after an LI split leaving
194       // it in an inconsistent state, so we cannot call LRM::unassign().
195       // See llvm bug #48911.
196       // Skip reassign if a register has originated from such split.
197       // FIXME: Remove the workaround when bug #48911 is fixed.
198       if (VRM->getPreSplitReg(Reg))
199         return NSA_Status::FIXED;
200 
201       const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
202 
203       if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
204         return NSA_Status::FIXED;
205 
206       for (auto U : MRI->use_nodbg_operands(Reg)) {
207         if (U.isImplicit())
208           return NSA_Status::FIXED;
209         const MachineInstr *UseInst = U.getParent();
210         if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
211           return NSA_Status::FIXED;
212       }
213 
214       if (!LIS->hasInterval(Reg))
215         return NSA_Status::FIXED;
216     }
217 
218     if (I == 0)
219       VgprBase = PhysReg;
220     else if (VgprBase + I != PhysReg)
221       NSA = true;
222   }
223 
224   return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
225 }
226 
227 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
228   ST = &MF.getSubtarget<GCNSubtarget>();
229   if (ST->getGeneration() < GCNSubtarget::GFX10)
230     return false;
231 
232   MRI = &MF.getRegInfo();
233   TRI = ST->getRegisterInfo();
234   VRM = &getAnalysis<VirtRegMap>();
235   LRM = &getAnalysis<LiveRegMatrix>();
236   LIS = &getAnalysis<LiveIntervals>();
237 
238   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
239   MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
240   MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
241   CSRegs = MRI->getCalleeSavedRegs();
242 
243   using Candidate = std::pair<const MachineInstr*, bool>;
244   SmallVector<Candidate, 32> Candidates;
245   for (const MachineBasicBlock &MBB : MF) {
246     for (const MachineInstr &MI : MBB) {
247       switch (CheckNSA(MI)) {
248       default:
249         continue;
250       case NSA_Status::CONTIGUOUS:
251         Candidates.push_back(std::make_pair(&MI, true));
252         break;
253       case NSA_Status::NON_CONTIGUOUS:
254         Candidates.push_back(std::make_pair(&MI, false));
255         ++NumNSAInstructions;
256         break;
257       }
258     }
259   }
260 
261   bool Changed = false;
262   for (auto &C : Candidates) {
263     if (C.second)
264       continue;
265 
266     const MachineInstr *MI = C.first;
267     if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
268       // Already happen to be fixed.
269       C.second = true;
270       ++NumNSAConverted;
271       continue;
272     }
273 
274     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
275     int VAddr0Idx =
276       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
277 
278     SmallVector<LiveInterval *, 16> Intervals;
279     SmallVector<MCRegister, 16> OrigRegs;
280     SlotIndex MinInd, MaxInd;
281     for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
282       const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
283       Register Reg = Op.getReg();
284       LiveInterval *LI = &LIS->getInterval(Reg);
285       if (llvm::is_contained(Intervals, LI)) {
286         // Same register used, unable to make sequential
287         Intervals.clear();
288         break;
289       }
290       Intervals.push_back(LI);
291       OrigRegs.push_back(VRM->getPhys(Reg));
292       if (LI->empty()) {
293         // The address input is undef, so it doesn't contribute to the relevant
294         // range. Seed a reasonable index range if required.
295         if (I == 0)
296           MinInd = MaxInd = LIS->getInstructionIndex(*MI);
297         continue;
298       }
299       MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
300       MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
301     }
302 
303     if (Intervals.empty())
304       continue;
305 
306     LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
307                       << "\tOriginal allocation:\t";
308                for (auto *LI
309                     : Intervals) dbgs()
310                << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
311                dbgs() << '\n');
312 
313     bool Success = scavengeRegs(Intervals);
314     if (!Success) {
315       LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
316       if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
317         continue;
318     } else {
319       // Check we did not make it worse for other instructions.
320       auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
321                                 [this](const Candidate &C, SlotIndex I) {
322                                   return LIS->getInstructionIndex(*C.first) < I;
323                                 });
324       for (auto E = Candidates.end(); Success && I != E &&
325               LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
326         if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
327           Success = false;
328           LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
329         }
330       }
331     }
332 
333     if (!Success) {
334       for (unsigned I = 0; I < Info->VAddrDwords; ++I)
335         if (VRM->hasPhys(Intervals[I]->reg()))
336           LRM->unassign(*Intervals[I]);
337 
338       for (unsigned I = 0; I < Info->VAddrDwords; ++I)
339         LRM->assign(*Intervals[I], OrigRegs[I]);
340 
341       continue;
342     }
343 
344     C.second = true;
345     ++NumNSAConverted;
346     LLVM_DEBUG(
347         dbgs() << "\tNew allocation:\t\t ["
348                << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
349                << " : "
350                << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
351                << "]\n");
352     Changed = true;
353   }
354 
355   return Changed;
356 }
357