xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp (revision 77a1348b3c1cfe8547be49a121b56299a1e18b69)
1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUSubtarget.h"
18 #include "SIInstrInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/LiveInterval.h"
22 #include "llvm/CodeGen/LiveIntervals.h"
23 #include "llvm/CodeGen/LiveRegMatrix.h"
24 #include "llvm/CodeGen/MachineFunctionPass.h"
25 #include "llvm/CodeGen/VirtRegMap.h"
26 #include "llvm/InitializePasses.h"
27 #include "llvm/Support/MathExtras.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-nsa-reassign"
33 
34 STATISTIC(NumNSAInstructions,
35           "Number of NSA instructions with non-sequential address found");
36 STATISTIC(NumNSAConverted,
37           "Number of NSA instructions changed to sequential");
38 
39 namespace {
40 
41 class GCNNSAReassign : public MachineFunctionPass {
42 public:
43   static char ID;
44 
45   GCNNSAReassign() : MachineFunctionPass(ID) {
46     initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
47   }
48 
49   bool runOnMachineFunction(MachineFunction &MF) override;
50 
51   StringRef getPassName() const override { return "GCN NSA Reassign"; }
52 
53   void getAnalysisUsage(AnalysisUsage &AU) const override {
54     AU.addRequired<LiveIntervals>();
55     AU.addRequired<VirtRegMap>();
56     AU.addRequired<LiveRegMatrix>();
57     AU.setPreservesAll();
58     MachineFunctionPass::getAnalysisUsage(AU);
59   }
60 
61 private:
62   typedef enum {
63     NOT_NSA,        // Not an NSA instruction
64     FIXED,          // NSA which we cannot modify
65     NON_CONTIGUOUS, // NSA with non-sequential address which we can try
66                     // to optimize.
67     CONTIGUOUS      // NSA with all sequential address registers
68   } NSA_Status;
69 
70   const GCNSubtarget *ST;
71 
72   const MachineRegisterInfo *MRI;
73 
74   const SIRegisterInfo *TRI;
75 
76   VirtRegMap *VRM;
77 
78   LiveRegMatrix *LRM;
79 
80   LiveIntervals *LIS;
81 
82   unsigned MaxNumVGPRs;
83 
84   const MCPhysReg *CSRegs;
85 
86   NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
87 
88   bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
89                           unsigned StartReg) const;
90 
91   bool canAssign(unsigned StartReg, unsigned NumRegs) const;
92 
93   bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
94 };
95 
96 } // End anonymous namespace.
97 
98 INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
99                       false, false)
100 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
101 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
102 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
103 INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
104                     false, false)
105 
106 
107 char GCNNSAReassign::ID = 0;
108 
109 char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
110 
111 bool
112 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
113                                    unsigned StartReg) const {
114   unsigned NumRegs = Intervals.size();
115 
116   for (unsigned N = 0; N < NumRegs; ++N)
117     if (VRM->hasPhys(Intervals[N]->reg))
118       LRM->unassign(*Intervals[N]);
119 
120   for (unsigned N = 0; N < NumRegs; ++N)
121     if (LRM->checkInterference(*Intervals[N], StartReg + N))
122       return false;
123 
124   for (unsigned N = 0; N < NumRegs; ++N)
125     LRM->assign(*Intervals[N], StartReg + N);
126 
127   return true;
128 }
129 
130 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
131   for (unsigned N = 0; N < NumRegs; ++N) {
132     unsigned Reg = StartReg + N;
133     if (!MRI->isAllocatable(Reg))
134       return false;
135 
136     for (unsigned I = 0; CSRegs[I]; ++I)
137       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
138           !LRM->isPhysRegUsed(CSRegs[I]))
139       return false;
140   }
141 
142   return true;
143 }
144 
145 bool
146 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
147   unsigned NumRegs = Intervals.size();
148 
149   if (NumRegs > MaxNumVGPRs)
150     return false;
151   unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
152 
153   for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
154     if (!canAssign(Reg, NumRegs))
155       continue;
156 
157     if (tryAssignRegisters(Intervals, Reg))
158       return true;
159   }
160 
161   return false;
162 }
163 
164 GCNNSAReassign::NSA_Status
165 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
166   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
167   if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
168     return NSA_Status::NOT_NSA;
169 
170   int VAddr0Idx =
171     AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
172 
173   unsigned VgprBase = 0;
174   bool NSA = false;
175   for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
176     const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
177     Register Reg = Op.getReg();
178     if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
179       return NSA_Status::FIXED;
180 
181     Register PhysReg = VRM->getPhys(Reg);
182 
183     if (!Fast) {
184       if (!PhysReg)
185         return NSA_Status::FIXED;
186 
187       // Bail if address is not a VGPR32. That should be possible to extend the
188       // optimization to work with subregs of a wider register tuples, but the
189       // logic to find free registers will be much more complicated with much
190       // less chances for success. That seems reasonable to assume that in most
191       // cases a tuple is used because a vector variable contains different
192       // parts of an address and it is either already consequitive or cannot
193       // be reassigned if not. If needed it is better to rely on register
194       // coalescer to process such address tuples.
195       if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
196         return NSA_Status::FIXED;
197 
198       const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
199 
200       if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
201         return NSA_Status::FIXED;
202 
203       for (auto U : MRI->use_nodbg_operands(Reg)) {
204         if (U.isImplicit())
205           return NSA_Status::FIXED;
206         const MachineInstr *UseInst = U.getParent();
207         if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
208           return NSA_Status::FIXED;
209       }
210 
211       if (!LIS->hasInterval(Reg))
212         return NSA_Status::FIXED;
213     }
214 
215     if (I == 0)
216       VgprBase = PhysReg;
217     else if (VgprBase + I != PhysReg)
218       NSA = true;
219   }
220 
221   return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
222 }
223 
224 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
225   ST = &MF.getSubtarget<GCNSubtarget>();
226   if (ST->getGeneration() < GCNSubtarget::GFX10)
227     return false;
228 
229   MRI = &MF.getRegInfo();
230   TRI = ST->getRegisterInfo();
231   VRM = &getAnalysis<VirtRegMap>();
232   LRM = &getAnalysis<LiveRegMatrix>();
233   LIS = &getAnalysis<LiveIntervals>();
234 
235   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
236   MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
237   MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
238   CSRegs = MRI->getCalleeSavedRegs();
239 
240   using Candidate = std::pair<const MachineInstr*, bool>;
241   SmallVector<Candidate, 32> Candidates;
242   for (const MachineBasicBlock &MBB : MF) {
243     for (const MachineInstr &MI : MBB) {
244       switch (CheckNSA(MI)) {
245       default:
246         continue;
247       case NSA_Status::CONTIGUOUS:
248         Candidates.push_back(std::make_pair(&MI, true));
249         break;
250       case NSA_Status::NON_CONTIGUOUS:
251         Candidates.push_back(std::make_pair(&MI, false));
252         ++NumNSAInstructions;
253         break;
254       }
255     }
256   }
257 
258   bool Changed = false;
259   for (auto &C : Candidates) {
260     if (C.second)
261       continue;
262 
263     const MachineInstr *MI = C.first;
264     if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
265       // Already happen to be fixed.
266       C.second = true;
267       ++NumNSAConverted;
268       continue;
269     }
270 
271     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
272     int VAddr0Idx =
273       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
274 
275     SmallVector<LiveInterval *, 16> Intervals;
276     SmallVector<unsigned, 16> OrigRegs;
277     SlotIndex MinInd, MaxInd;
278     for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
279       const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
280       Register Reg = Op.getReg();
281       LiveInterval *LI = &LIS->getInterval(Reg);
282       if (llvm::find(Intervals, LI) != Intervals.end()) {
283         // Same register used, unable to make sequential
284         Intervals.clear();
285         break;
286       }
287       Intervals.push_back(LI);
288       OrigRegs.push_back(VRM->getPhys(Reg));
289       MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
290       MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
291     }
292 
293     if (Intervals.empty())
294       continue;
295 
296     LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
297                       << "\tOriginal allocation:\t";
298                for(auto *LI : Intervals)
299                  dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
300                dbgs() << '\n');
301 
302     bool Success = scavengeRegs(Intervals);
303     if (!Success) {
304       LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
305       if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
306         continue;
307     } else {
308       // Check we did not make it worse for other instructions.
309       auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
310                                 [this](const Candidate &C, SlotIndex I) {
311                                   return LIS->getInstructionIndex(*C.first) < I;
312                                 });
313       for (auto E = Candidates.end(); Success && I != E &&
314               LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
315         if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
316           Success = false;
317           LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
318         }
319       }
320     }
321 
322     if (!Success) {
323       for (unsigned I = 0; I < Info->VAddrDwords; ++I)
324         if (VRM->hasPhys(Intervals[I]->reg))
325           LRM->unassign(*Intervals[I]);
326 
327       for (unsigned I = 0; I < Info->VAddrDwords; ++I)
328         LRM->assign(*Intervals[I], OrigRegs[I]);
329 
330       continue;
331     }
332 
333     C.second = true;
334     ++NumNSAConverted;
335     LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
336                  << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
337                  << " : "
338                  << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
339                  << "]\n");
340     Changed = true;
341   }
342 
343   return Changed;
344 }
345