1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential 11 /// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA 12 /// with sequential versions where possible. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUSubtarget.h" 18 #include "SIInstrInfo.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/CodeGen/LiveInterval.h" 22 #include "llvm/CodeGen/LiveIntervals.h" 23 #include "llvm/CodeGen/LiveRegMatrix.h" 24 #include "llvm/CodeGen/MachineFunctionPass.h" 25 #include "llvm/CodeGen/VirtRegMap.h" 26 #include "llvm/InitializePasses.h" 27 #include "llvm/Support/MathExtras.h" 28 #include <algorithm> 29 30 using namespace llvm; 31 32 #define DEBUG_TYPE "amdgpu-nsa-reassign" 33 34 STATISTIC(NumNSAInstructions, 35 "Number of NSA instructions with non-sequential address found"); 36 STATISTIC(NumNSAConverted, 37 "Number of NSA instructions changed to sequential"); 38 39 namespace { 40 41 class GCNNSAReassign : public MachineFunctionPass { 42 public: 43 static char ID; 44 45 GCNNSAReassign() : MachineFunctionPass(ID) { 46 initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); 47 } 48 49 bool runOnMachineFunction(MachineFunction &MF) override; 50 51 StringRef getPassName() const override { return "GCN NSA Reassign"; } 52 53 void getAnalysisUsage(AnalysisUsage &AU) const override { 54 AU.addRequired<LiveIntervals>(); 55 AU.addRequired<VirtRegMap>(); 56 AU.addRequired<LiveRegMatrix>(); 57 AU.setPreservesAll(); 58 MachineFunctionPass::getAnalysisUsage(AU); 59 } 60 61 private: 62 typedef enum { 63 NOT_NSA, // Not an NSA instruction 64 FIXED, // NSA which we cannot modify 65 NON_CONTIGUOUS, // NSA with non-sequential address which we can try 66 // to optimize. 67 CONTIGUOUS // NSA with all sequential address registers 68 } NSA_Status; 69 70 const GCNSubtarget *ST; 71 72 const MachineRegisterInfo *MRI; 73 74 const SIRegisterInfo *TRI; 75 76 VirtRegMap *VRM; 77 78 LiveRegMatrix *LRM; 79 80 LiveIntervals *LIS; 81 82 unsigned MaxNumVGPRs; 83 84 const MCPhysReg *CSRegs; 85 86 NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const; 87 88 bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, 89 unsigned StartReg) const; 90 91 bool canAssign(unsigned StartReg, unsigned NumRegs) const; 92 93 bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const; 94 }; 95 96 } // End anonymous namespace. 97 98 INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", 99 false, false) 100 INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 101 INITIALIZE_PASS_DEPENDENCY(VirtRegMap) 102 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) 103 INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", 104 false, false) 105 106 107 char GCNNSAReassign::ID = 0; 108 109 char &llvm::GCNNSAReassignID = GCNNSAReassign::ID; 110 111 bool 112 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, 113 unsigned StartReg) const { 114 unsigned NumRegs = Intervals.size(); 115 116 for (unsigned N = 0; N < NumRegs; ++N) 117 if (VRM->hasPhys(Intervals[N]->reg)) 118 LRM->unassign(*Intervals[N]); 119 120 for (unsigned N = 0; N < NumRegs; ++N) 121 if (LRM->checkInterference(*Intervals[N], StartReg + N)) 122 return false; 123 124 for (unsigned N = 0; N < NumRegs; ++N) 125 LRM->assign(*Intervals[N], StartReg + N); 126 127 return true; 128 } 129 130 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { 131 for (unsigned N = 0; N < NumRegs; ++N) { 132 unsigned Reg = StartReg + N; 133 if (!MRI->isAllocatable(Reg)) 134 return false; 135 136 for (unsigned I = 0; CSRegs[I]; ++I) 137 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && 138 !LRM->isPhysRegUsed(CSRegs[I])) 139 return false; 140 } 141 142 return true; 143 } 144 145 bool 146 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const { 147 unsigned NumRegs = Intervals.size(); 148 149 if (NumRegs > MaxNumVGPRs) 150 return false; 151 unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0; 152 153 for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) { 154 if (!canAssign(Reg, NumRegs)) 155 continue; 156 157 if (tryAssignRegisters(Intervals, Reg)) 158 return true; 159 } 160 161 return false; 162 } 163 164 GCNNSAReassign::NSA_Status 165 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { 166 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); 167 if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) 168 return NSA_Status::NOT_NSA; 169 170 int VAddr0Idx = 171 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); 172 173 unsigned VgprBase = 0; 174 bool NSA = false; 175 for (unsigned I = 0; I < Info->VAddrDwords; ++I) { 176 const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); 177 Register Reg = Op.getReg(); 178 if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) 179 return NSA_Status::FIXED; 180 181 Register PhysReg = VRM->getPhys(Reg); 182 183 if (!Fast) { 184 if (!PhysReg) 185 return NSA_Status::FIXED; 186 187 // Bail if address is not a VGPR32. That should be possible to extend the 188 // optimization to work with subregs of a wider register tuples, but the 189 // logic to find free registers will be much more complicated with much 190 // less chances for success. That seems reasonable to assume that in most 191 // cases a tuple is used because a vector variable contains different 192 // parts of an address and it is either already consequitive or cannot 193 // be reassigned if not. If needed it is better to rely on register 194 // coalescer to process such address tuples. 195 if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) 196 return NSA_Status::FIXED; 197 198 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); 199 200 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) 201 return NSA_Status::FIXED; 202 203 for (auto U : MRI->use_nodbg_operands(Reg)) { 204 if (U.isImplicit()) 205 return NSA_Status::FIXED; 206 const MachineInstr *UseInst = U.getParent(); 207 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) 208 return NSA_Status::FIXED; 209 } 210 211 if (!LIS->hasInterval(Reg)) 212 return NSA_Status::FIXED; 213 } 214 215 if (I == 0) 216 VgprBase = PhysReg; 217 else if (VgprBase + I != PhysReg) 218 NSA = true; 219 } 220 221 return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; 222 } 223 224 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { 225 ST = &MF.getSubtarget<GCNSubtarget>(); 226 if (ST->getGeneration() < GCNSubtarget::GFX10) 227 return false; 228 229 MRI = &MF.getRegInfo(); 230 TRI = ST->getRegisterInfo(); 231 VRM = &getAnalysis<VirtRegMap>(); 232 LRM = &getAnalysis<LiveRegMatrix>(); 233 LIS = &getAnalysis<LiveIntervals>(); 234 235 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 236 MaxNumVGPRs = ST->getMaxNumVGPRs(MF); 237 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs); 238 CSRegs = MRI->getCalleeSavedRegs(); 239 240 using Candidate = std::pair<const MachineInstr*, bool>; 241 SmallVector<Candidate, 32> Candidates; 242 for (const MachineBasicBlock &MBB : MF) { 243 for (const MachineInstr &MI : MBB) { 244 switch (CheckNSA(MI)) { 245 default: 246 continue; 247 case NSA_Status::CONTIGUOUS: 248 Candidates.push_back(std::make_pair(&MI, true)); 249 break; 250 case NSA_Status::NON_CONTIGUOUS: 251 Candidates.push_back(std::make_pair(&MI, false)); 252 ++NumNSAInstructions; 253 break; 254 } 255 } 256 } 257 258 bool Changed = false; 259 for (auto &C : Candidates) { 260 if (C.second) 261 continue; 262 263 const MachineInstr *MI = C.first; 264 if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) { 265 // Already happen to be fixed. 266 C.second = true; 267 ++NumNSAConverted; 268 continue; 269 } 270 271 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode()); 272 int VAddr0Idx = 273 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0); 274 275 SmallVector<LiveInterval *, 16> Intervals; 276 SmallVector<unsigned, 16> OrigRegs; 277 SlotIndex MinInd, MaxInd; 278 for (unsigned I = 0; I < Info->VAddrDwords; ++I) { 279 const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); 280 Register Reg = Op.getReg(); 281 LiveInterval *LI = &LIS->getInterval(Reg); 282 if (llvm::find(Intervals, LI) != Intervals.end()) { 283 // Same register used, unable to make sequential 284 Intervals.clear(); 285 break; 286 } 287 Intervals.push_back(LI); 288 OrigRegs.push_back(VRM->getPhys(Reg)); 289 if (LI->empty()) { 290 // The address input is undef, so it doesn't contribute to the relevant 291 // range. Seed a reasonable index range if required. 292 if (I == 0) 293 MinInd = MaxInd = LIS->getInstructionIndex(*MI); 294 continue; 295 } 296 MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); 297 MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); 298 } 299 300 if (Intervals.empty()) 301 continue; 302 303 LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI 304 << "\tOriginal allocation:\t"; 305 for(auto *LI : Intervals) 306 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI); 307 dbgs() << '\n'); 308 309 bool Success = scavengeRegs(Intervals); 310 if (!Success) { 311 LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n"); 312 if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation. 313 continue; 314 } else { 315 // Check we did not make it worse for other instructions. 316 auto I = std::lower_bound(Candidates.begin(), &C, MinInd, 317 [this](const Candidate &C, SlotIndex I) { 318 return LIS->getInstructionIndex(*C.first) < I; 319 }); 320 for (auto E = Candidates.end(); Success && I != E && 321 LIS->getInstructionIndex(*I->first) < MaxInd; ++I) { 322 if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) { 323 Success = false; 324 LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first); 325 } 326 } 327 } 328 329 if (!Success) { 330 for (unsigned I = 0; I < Info->VAddrDwords; ++I) 331 if (VRM->hasPhys(Intervals[I]->reg)) 332 LRM->unassign(*Intervals[I]); 333 334 for (unsigned I = 0; I < Info->VAddrDwords; ++I) 335 LRM->assign(*Intervals[I], OrigRegs[I]); 336 337 continue; 338 } 339 340 C.second = true; 341 ++NumNSAConverted; 342 LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t [" 343 << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI) 344 << " : " 345 << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI) 346 << "]\n"); 347 Changed = true; 348 } 349 350 return Changed; 351 } 352