//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The SIShrinkInstructions pass will later replace
/// NSA encodings with sequential versions where possible.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  typedef enum {
    NOT_NSA,        // Not an NSA instruction.
    FIXED,          // NSA which we cannot modify.
    NON_CONTIGUOUS, // NSA with a non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers.
  } NSA_Status;

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)

char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;

bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg()))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));

  return true;
}

bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return NSA_Status::NOT_NSA;

  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
  case AMDGPU::MIMGEncGfx11NSA:
    break;
  default:
    return NSA_Status::NOT_NSA;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // TODO: Address the limitation below to handle GFX11 BVH instructions.
      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated, with
      // much less chance of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains
      // different parts of an address, and it is either already consecutive
      // or cannot be reassigned if it is not. If needed, it is better to rely
      // on the register coalescer to process such address tuples.
      if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
        return NSA_Status::FIXED;

      // InlineSpiller does not call LRM::assign() after an LI split, leaving
      // it in an inconsistent state, so we cannot call LRM::unassign().
      // See llvm bug #48911.
      // Skip the reassignment if a register originated from such a split.
      // FIXME: Remove the workaround when bug #48911 is fixed.
      if (VRM->getPreSplitReg(Reg))
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  using Candidate = std::pair<const MachineInstr *, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happens to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used twice; unable to make the addresses sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " "
                        << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
           LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
               << " : "
               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}