xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "GCNNSAReassign.h"
17 #include "AMDGPU.h"
18 #include "GCNSubtarget.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/CodeGen/LiveIntervals.h"
23 #include "llvm/CodeGen/LiveRegMatrix.h"
24 #include "llvm/CodeGen/MachineFunctionPass.h"
25 #include "llvm/CodeGen/VirtRegMap.h"
26 #include "llvm/InitializePasses.h"
27 
28 using namespace llvm;
29 
#define DEBUG_TYPE "amdgpu-nsa-reassign"

// Pass statistics, reported with -stats.
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
36 
37 namespace {
/// Shared implementation of the NSA reassignment optimization, used by both
/// the legacy and new pass-manager wrappers.
///
/// Scans a machine function for GFX10/GFX11 NSA image instructions whose
/// address VGPRs were allocated non-contiguously and tries to re-assign those
/// registers to a contiguous range so that SIShrinkInstructions can later
/// switch the instruction to the shorter sequential encoding.
class GCNNSAReassignImpl {
public:
  GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS)
      : VRM(VM), LRM(LM), LIS(LS) {}

  /// Run the optimization over \p MF. Returns true if any register
  /// assignment was changed.
  bool run(MachineFunction &MF);

private:
  /// Classification of an instruction with respect to the NSA encoding.
  using NSA_Status = enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  };

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  // Upper bound on usable VGPRs (capped by occupancy in run()).
  unsigned MaxNumVGPRs;

  // Null-terminated list of callee-saved registers for this function.
  const MCPhysReg *CSRegs;

  /// Classify \p MI. With \p Fast set, skip the expensive legality checks
  /// and only evaluate contiguity of the assigned physical registers.
  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  /// Try to place \p Intervals into the physical registers StartReg,
  /// StartReg + 1, ... in order. Unassigns the current allocation first.
  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  /// Check that NumRegs registers starting at StartReg are allocatable and
  /// do not pull in a previously unused callee-saved register.
  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  /// Search the VGPR file for a contiguous free range able to hold all of
  /// \p Intervals and assign them there.
  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};
79 
80 class GCNNSAReassignLegacy : public MachineFunctionPass {
81 public:
82   static char ID;
83 
GCNNSAReassignLegacy()84   GCNNSAReassignLegacy() : MachineFunctionPass(ID) {
85     initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry());
86   }
87 
88   bool runOnMachineFunction(MachineFunction &MF) override;
89 
getPassName() const90   StringRef getPassName() const override { return "GCN NSA Reassign"; };
91 
getAnalysisUsage(AnalysisUsage & AU) const92   void getAnalysisUsage(AnalysisUsage &AU) const override {
93     AU.addRequired<LiveIntervalsWrapperPass>();
94     AU.addRequired<VirtRegMapWrapperLegacy>();
95     AU.addRequired<LiveRegMatrixWrapperLegacy>();
96     AU.setPreservesAll();
97     MachineFunctionPass::getAnalysisUsage(AU);
98   }
99 };
100 
101 } // End anonymous namespace.
102 
// Register the legacy pass and the analyses it depends on with the global
// pass registry.
INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false,
                    false)

char GCNNSAReassignLegacy::ID = 0;

// Opaque identifier other code uses to refer to this pass by address.
char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID;
114 
tryAssignRegisters(SmallVectorImpl<LiveInterval * > & Intervals,unsigned StartReg) const115 bool GCNNSAReassignImpl::tryAssignRegisters(
116     SmallVectorImpl<LiveInterval *> &Intervals, unsigned StartReg) const {
117   unsigned NumRegs = Intervals.size();
118 
119   for (unsigned N = 0; N < NumRegs; ++N)
120     if (VRM->hasPhys(Intervals[N]->reg()))
121       LRM->unassign(*Intervals[N]);
122 
123   for (unsigned N = 0; N < NumRegs; ++N)
124     if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
125       return false;
126 
127   for (unsigned N = 0; N < NumRegs; ++N)
128     LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
129 
130   return true;
131 }
132 
canAssign(unsigned StartReg,unsigned NumRegs) const133 bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const {
134   for (unsigned N = 0; N < NumRegs; ++N) {
135     unsigned Reg = StartReg + N;
136     if (!MRI->isAllocatable(Reg))
137       return false;
138 
139     for (unsigned I = 0; CSRegs[I]; ++I)
140       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
141           !LRM->isPhysRegUsed(CSRegs[I]))
142       return false;
143   }
144 
145   return true;
146 }
147 
scavengeRegs(SmallVectorImpl<LiveInterval * > & Intervals) const148 bool GCNNSAReassignImpl::scavengeRegs(
149     SmallVectorImpl<LiveInterval *> &Intervals) const {
150   unsigned NumRegs = Intervals.size();
151 
152   if (NumRegs > MaxNumVGPRs)
153     return false;
154   unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
155 
156   for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
157     if (!canAssign(Reg, NumRegs))
158       continue;
159 
160     if (tryAssignRegisters(Intervals, Reg))
161       return true;
162   }
163 
164   return false;
165 }
166 
167 GCNNSAReassignImpl::NSA_Status
CheckNSA(const MachineInstr & MI,bool Fast) const168 GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const {
169   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
170   if (!Info)
171     return NSA_Status::NOT_NSA;
172 
173   switch (Info->MIMGEncoding) {
174   case AMDGPU::MIMGEncGfx10NSA:
175   case AMDGPU::MIMGEncGfx11NSA:
176     break;
177   default:
178     return NSA_Status::NOT_NSA;
179   }
180 
181   int VAddr0Idx =
182     AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
183 
184   unsigned VgprBase = 0;
185   bool NSA = false;
186   for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
187     const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
188     Register Reg = Op.getReg();
189     if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
190       return NSA_Status::FIXED;
191 
192     Register PhysReg = VRM->getPhys(Reg);
193 
194     if (!Fast) {
195       if (!PhysReg)
196         return NSA_Status::FIXED;
197 
198       // TODO: address the below limitation to handle GFX11 BVH instructions
199       // Bail if address is not a VGPR32. That should be possible to extend the
200       // optimization to work with subregs of a wider register tuples, but the
201       // logic to find free registers will be much more complicated with much
202       // less chances for success. That seems reasonable to assume that in most
203       // cases a tuple is used because a vector variable contains different
204       // parts of an address and it is either already consecutive or cannot
205       // be reassigned if not. If needed it is better to rely on register
206       // coalescer to process such address tuples.
207       if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
208         return NSA_Status::FIXED;
209 
210       // InlineSpiller does not call LRM::assign() after an LI split leaving
211       // it in an inconsistent state, so we cannot call LRM::unassign().
212       // See llvm bug #48911.
213       // Skip reassign if a register has originated from such split.
214       // FIXME: Remove the workaround when bug #48911 is fixed.
215       if (VRM->getPreSplitReg(Reg))
216         return NSA_Status::FIXED;
217 
218       const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
219 
220       if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
221         return NSA_Status::FIXED;
222 
223       for (auto U : MRI->use_nodbg_operands(Reg)) {
224         if (U.isImplicit())
225           return NSA_Status::FIXED;
226         const MachineInstr *UseInst = U.getParent();
227         if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
228           return NSA_Status::FIXED;
229       }
230 
231       if (!LIS->hasInterval(Reg))
232         return NSA_Status::FIXED;
233     }
234 
235     if (I == 0)
236       VgprBase = PhysReg;
237     else if (VgprBase + I != PhysReg)
238       NSA = true;
239   }
240 
241   return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
242 }
243 
// Main driver: collect NSA candidates, then try to re-pack the address
// VGPRs of each non-contiguous candidate into a sequential range, rolling
// back any attempt that fails or that would break an already-contiguous
// candidate elsewhere in the function.
bool GCNNSAReassignImpl::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  // Nothing to do unless the target has both NSA and non-NSA encodings.
  if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();

  // Cap the scavenging range by the occupancy-driven VGPR budget so the
  // reassignment cannot reduce occupancy.
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(
      ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
      MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  // Candidate = (instruction, already-contiguous flag). Collected in program
  // order, which the conflict scan below relies on for lower_bound.
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    // Skip candidates that are (or have become) contiguous.
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    // A previous reassignment may have made this one contiguous as a side
    // effect.
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happen to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    // Gather the live intervals of all address operands, remembering the
    // original physical registers for rollback, and track the combined
    // [MinInd, MaxInd) slot range the reassignment may affect.
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    // Empty means a duplicated address register was found above.
    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI
                    : Intervals) dbgs()
               << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      // Scan only candidates whose slot index overlaps [MinInd, MaxInd);
      // Candidates is in program order, so lower_bound finds the first one.
      auto *I =
          std::lower_bound(Candidates.begin(), &C, MinInd,
                           [this](const Candidate &C, SlotIndex I) {
                             return LIS->getInstructionIndex(*C.first) < I;
                           });
      for (auto *E = Candidates.end();
           Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd;
           ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      // Roll back: release whatever was assigned and restore the original
      // physical registers recorded in OrigRegs.
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
               << " : "
               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}
375 
runOnMachineFunction(MachineFunction & MF)376 bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) {
377   auto *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
378   auto *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
379   auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
380 
381   GCNNSAReassignImpl Impl(VRM, LRM, LIS);
382   return Impl.run(MF);
383 }
384 
385 PreservedAnalyses
run(MachineFunction & MF,MachineFunctionAnalysisManager & MFAM)386 GCNNSAReassignPass::run(MachineFunction &MF,
387                         MachineFunctionAnalysisManager &MFAM) {
388   auto &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF);
389   auto &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
390   auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
391 
392   GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS);
393   Impl.run(MF);
394   return PreservedAnalyses::all();
395 }
396