xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp (revision 7029da5c36f2d3cf6bb6c81bf551229f416399e8)
//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUSubtarget.h"
18 #include "SIInstrInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/LiveInterval.h"
22 #include "llvm/CodeGen/LiveIntervals.h"
23 #include "llvm/CodeGen/LiveRegMatrix.h"
24 #include "llvm/CodeGen/MachineFunctionPass.h"
25 #include "llvm/CodeGen/VirtRegMap.h"
26 #include "llvm/Support/MathExtras.h"
27 #include <algorithm>
28 
29 using namespace llvm;
30 
31 #define DEBUG_TYPE "amdgpu-nsa-reassign"
32 
// Pass-wide statistics, printed with -stats.
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
37 
namespace {

/// Machine-function pass that tries to re-pack the address operands of
/// GFX10 NSA image instructions into contiguous VGPR ranges so that
/// SIShrinkInstructions can later convert them to the shorter sequential
/// encoding. Runs after register allocation (it rewrites assignments via
/// VirtRegMap/LiveRegMatrix) and preserves all analyses.
class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Needs the post-RA allocation state to query and rewrite assignments.
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  // Classification of an instruction with respect to the NSA encoding.
  typedef enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  } NSA_Status;

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  // Upper bound on usable VGPRs (limited by both the function and its
  // occupancy target); see runOnMachineFunction.
  unsigned MaxNumVGPRs;

  // Null-terminated callee-saved register list for the current function.
  const MCPhysReg *CSRegs;

  // Classify \p MI. With \p Fast set, skip the expensive per-operand
  // legality checks and only compare the current physical assignment.
  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  // Try to assign Intervals to the contiguous range starting at StartReg.
  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  // Check that [StartReg, StartReg + NumRegs) is allocatable and does not
  // touch an unused callee-saved register.
  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  // Search the VGPR file for a contiguous range for Intervals.
  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.
96 
// Pass registration boilerplate: registers the pass and its analysis
// dependencies with the LLVM PassRegistry.
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

// Exported ID so the target pass pipeline can reference this pass.
char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
109 
110 bool
111 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
112                                    unsigned StartReg) const {
113   unsigned NumRegs = Intervals.size();
114 
115   for (unsigned N = 0; N < NumRegs; ++N)
116     if (VRM->hasPhys(Intervals[N]->reg))
117       LRM->unassign(*Intervals[N]);
118 
119   for (unsigned N = 0; N < NumRegs; ++N)
120     if (LRM->checkInterference(*Intervals[N], StartReg + N))
121       return false;
122 
123   for (unsigned N = 0; N < NumRegs; ++N)
124     LRM->assign(*Intervals[N], StartReg + N);
125 
126   return true;
127 }
128 
129 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
130   for (unsigned N = 0; N < NumRegs; ++N) {
131     unsigned Reg = StartReg + N;
132     if (!MRI->isAllocatable(Reg))
133       return false;
134 
135     for (unsigned I = 0; CSRegs[I]; ++I)
136       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
137           !LRM->isPhysRegUsed(CSRegs[I]))
138       return false;
139   }
140 
141   return true;
142 }
143 
144 bool
145 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
146   unsigned NumRegs = Intervals.size();
147 
148   if (NumRegs > MaxNumVGPRs)
149     return false;
150   unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
151 
152   for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
153     if (!canAssign(Reg, NumRegs))
154       continue;
155 
156     if (tryAssignRegisters(Intervals, Reg))
157       return true;
158   }
159 
160   return false;
161 }
162 
163 GCNNSAReassign::NSA_Status
164 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
165   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
166   if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
167     return NSA_Status::NOT_NSA;
168 
169   int VAddr0Idx =
170     AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
171 
172   unsigned VgprBase = 0;
173   bool NSA = false;
174   for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
175     const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
176     unsigned Reg = Op.getReg();
177     if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
178       return NSA_Status::FIXED;
179 
180     unsigned PhysReg = VRM->getPhys(Reg);
181 
182     if (!Fast) {
183       if (!PhysReg)
184         return NSA_Status::FIXED;
185 
186       // Bail if address is not a VGPR32. That should be possible to extend the
187       // optimization to work with subregs of a wider register tuples, but the
188       // logic to find free registers will be much more complicated with much
189       // less chances for success. That seems reasonable to assume that in most
190       // cases a tuple is used because a vector variable contains different
191       // parts of an address and it is either already consequitive or cannot
192       // be reassigned if not. If needed it is better to rely on register
193       // coalescer to process such address tuples.
194       if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
195         return NSA_Status::FIXED;
196 
197       const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
198 
199       if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
200         return NSA_Status::FIXED;
201 
202       for (auto U : MRI->use_nodbg_operands(Reg)) {
203         if (U.isImplicit())
204           return NSA_Status::FIXED;
205         const MachineInstr *UseInst = U.getParent();
206         if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
207           return NSA_Status::FIXED;
208       }
209 
210       if (!LIS->hasInterval(Reg))
211         return NSA_Status::FIXED;
212     }
213 
214     if (I == 0)
215       VgprBase = PhysReg;
216     else if (VgprBase + I != PhysReg)
217       NSA = true;
218   }
219 
220   return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
221 }
222 
// Collect all NSA candidates in the function, then try to re-pack the
// address VGPRs of every non-contiguous one into a sequential range.
// Returns true if any register assignment was changed.
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  // NSA encoding only exists on GFX10+.
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Respect both the function's VGPR budget and its occupancy target so
  // scavenging never grows register pressure past either limit.
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  // Candidate: an NSA instruction plus a flag meaning "already contiguous
  // (or successfully converted)".
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    // A previous reassignment may have made this one contiguous already.
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happen to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    // Gather the live interval of every address operand, remembering the
    // original physical registers for rollback and the overall slot-index
    // window for the conflict check below.
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<unsigned, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      unsigned Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::find(Intervals, LI) != Intervals.end()) {
        // Same register used, unable to make sequential
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for(auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      // Only candidates whose slot index falls inside [MinInd, MaxInd) can
      // be affected; find the window start with lower_bound (using &C as
      // the end iterator — only earlier candidates are already settled).
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
              LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        // A previously-contiguous candidate that is no longer contiguous
        // means our move regressed it: abort and roll back.
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      // Roll back: unassign whatever was moved, then restore the original
      // physical registers recorded in OrigRegs.
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
                 << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
                 << " : "
                 << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
                 << "]\n");
    Changed = true;
  }

  return Changed;
}
344