1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential 11 /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA 12 /// with sequential versions where possible. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "GCNNSAReassign.h" 17 #include "AMDGPU.h" 18 #include "GCNSubtarget.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "SIRegisterInfo.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/CodeGen/LiveIntervals.h" 23 #include "llvm/CodeGen/LiveRegMatrix.h" 24 #include "llvm/CodeGen/MachineFunctionPass.h" 25 #include "llvm/CodeGen/VirtRegMap.h" 26 #include "llvm/InitializePasses.h" 27 28 using namespace llvm; 29 30 #define DEBUG_TYPE "amdgpu-nsa-reassign" 31 32 STATISTIC(NumNSAInstructions, 33 "Number of NSA instructions with non-sequential address found"); 34 STATISTIC(NumNSAConverted, 35 "Number of NSA instructions changed to sequential"); 36 37 namespace { 38 class GCNNSAReassignImpl { 39 public: 40 GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS) 41 : VRM(VM), LRM(LM), LIS(LS) {} 42 43 bool run(MachineFunction &MF); 44 45 private: 46 using NSA_Status = enum { 47 NOT_NSA, // Not an NSA instruction 48 FIXED, // NSA which we cannot modify 49 NON_CONTIGUOUS, // NSA with non-sequential address which we can try 50 // to optimize. 51 CONTIGUOUS // NSA with all sequential address registers 52 }; 53 54 const GCNSubtarget *ST; 55 56 const MachineRegisterInfo *MRI; 57 58 const SIRegisterInfo *TRI; 59 60 VirtRegMap *VRM; 61 62 LiveRegMatrix *LRM; 63 64 LiveIntervals *LIS; 65 66 unsigned MaxNumVGPRs; 67 68 const MCPhysReg *CSRegs; 69 70 NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const; 71 72 bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, 73 unsigned StartReg) const; 74 75 bool canAssign(unsigned StartReg, unsigned NumRegs) const; 76 77 bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const; 78 }; 79 80 class GCNNSAReassignLegacy : public MachineFunctionPass { 81 public: 82 static char ID; 83 84 GCNNSAReassignLegacy() : MachineFunctionPass(ID) { 85 initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry()); 86 } 87 88 bool runOnMachineFunction(MachineFunction &MF) override; 89 90 StringRef getPassName() const override { return "GCN NSA Reassign"; }; 91 92 void getAnalysisUsage(AnalysisUsage &AU) const override { 93 AU.addRequired<LiveIntervalsWrapperPass>(); 94 AU.addRequired<VirtRegMapWrapperLegacy>(); 95 AU.addRequired<LiveRegMatrixWrapperLegacy>(); 96 AU.setPreservesAll(); 97 MachineFunctionPass::getAnalysisUsage(AU); 98 } 99 }; 100 101 } // End anonymous namespace. 102 103 INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", 104 false, false) 105 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) 106 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) 107 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) 108 INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false, 109 false) 110 111 char GCNNSAReassignLegacy::ID = 0; 112 113 char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID; 114 115 bool GCNNSAReassignImpl::tryAssignRegisters( 116 SmallVectorImpl<LiveInterval *> &Intervals, unsigned StartReg) const { 117 unsigned NumRegs = Intervals.size(); 118 119 for (unsigned N = 0; N < NumRegs; ++N) 120 if (VRM->hasPhys(Intervals[N]->reg())) 121 LRM->unassign(*Intervals[N]); 122 123 for (unsigned N = 0; N < NumRegs; ++N) 124 if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N))) 125 return false; 126 127 for (unsigned N = 0; N < NumRegs; ++N) 128 LRM->assign(*Intervals[N], MCRegister::from(StartReg + N)); 129 130 return true; 131 } 132 133 bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const { 134 for (unsigned N = 0; N < NumRegs; ++N) { 135 unsigned Reg = StartReg + N; 136 if (!MRI->isAllocatable(Reg)) 137 return false; 138 139 for (unsigned I = 0; CSRegs[I]; ++I) 140 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && 141 !LRM->isPhysRegUsed(CSRegs[I])) 142 return false; 143 } 144 145 return true; 146 } 147 148 bool GCNNSAReassignImpl::scavengeRegs( 149 SmallVectorImpl<LiveInterval *> &Intervals) const { 150 unsigned NumRegs = Intervals.size(); 151 152 if (NumRegs > MaxNumVGPRs) 153 return false; 154 unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0; 155 156 for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) { 157 if (!canAssign(Reg, NumRegs)) 158 continue; 159 160 if (tryAssignRegisters(Intervals, Reg)) 161 return true; 162 } 163 164 return false; 165 } 166 167 GCNNSAReassignImpl::NSA_Status 168 GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const { 169 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); 170 if (!Info) 171 return NSA_Status::NOT_NSA; 172 173 switch (Info->MIMGEncoding) { 174 case AMDGPU::MIMGEncGfx10NSA: 175 case AMDGPU::MIMGEncGfx11NSA: 176 break; 177 default: 178 return NSA_Status::NOT_NSA; 179 } 180 181 int VAddr0Idx = 182 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); 183 184 unsigned VgprBase = 0; 185 bool NSA = false; 186 for (unsigned I = 0; I < Info->VAddrOperands; ++I) { 187 const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); 188 Register Reg = Op.getReg(); 189 if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) 190 return NSA_Status::FIXED; 191 192 Register PhysReg = VRM->getPhys(Reg); 193 194 if (!Fast) { 195 if (!PhysReg) 196 return NSA_Status::FIXED; 197 198 // TODO: address the below limitation to handle GFX11 BVH instructions 199 // Bail if address is not a VGPR32. That should be possible to extend the 200 // optimization to work with subregs of a wider register tuples, but the 201 // logic to find free registers will be much more complicated with much 202 // less chances for success. That seems reasonable to assume that in most 203 // cases a tuple is used because a vector variable contains different 204 // parts of an address and it is either already consecutive or cannot 205 // be reassigned if not. If needed it is better to rely on register 206 // coalescer to process such address tuples. 207 if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg()) 208 return NSA_Status::FIXED; 209 210 // InlineSpiller does not call LRM::assign() after an LI split leaving 211 // it in an inconsistent state, so we cannot call LRM::unassign(). 212 // See llvm bug #48911. 213 // Skip reassign if a register has originated from such split. 214 // FIXME: Remove the workaround when bug #48911 is fixed. 215 if (VRM->getPreSplitReg(Reg)) 216 return NSA_Status::FIXED; 217 218 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); 219 220 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) 221 return NSA_Status::FIXED; 222 223 for (auto U : MRI->use_nodbg_operands(Reg)) { 224 if (U.isImplicit()) 225 return NSA_Status::FIXED; 226 const MachineInstr *UseInst = U.getParent(); 227 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) 228 return NSA_Status::FIXED; 229 } 230 231 if (!LIS->hasInterval(Reg)) 232 return NSA_Status::FIXED; 233 } 234 235 if (I == 0) 236 VgprBase = PhysReg; 237 else if (VgprBase + I != PhysReg) 238 NSA = true; 239 } 240 241 return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; 242 } 243 244 bool GCNNSAReassignImpl::run(MachineFunction &MF) { 245 ST = &MF.getSubtarget<GCNSubtarget>(); 246 if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding()) 247 return false; 248 249 MRI = &MF.getRegInfo(); 250 TRI = ST->getRegisterInfo(); 251 252 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 253 MaxNumVGPRs = ST->getMaxNumVGPRs(MF); 254 MaxNumVGPRs = std::min( 255 ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()), 256 MaxNumVGPRs); 257 CSRegs = MRI->getCalleeSavedRegs(); 258 259 using Candidate = std::pair<const MachineInstr*, bool>; 260 SmallVector<Candidate, 32> Candidates; 261 for (const MachineBasicBlock &MBB : MF) { 262 for (const MachineInstr &MI : MBB) { 263 switch (CheckNSA(MI)) { 264 default: 265 continue; 266 case NSA_Status::CONTIGUOUS: 267 Candidates.push_back(std::pair(&MI, true)); 268 break; 269 case NSA_Status::NON_CONTIGUOUS: 270 Candidates.push_back(std::pair(&MI, false)); 271 ++NumNSAInstructions; 272 break; 273 } 274 } 275 } 276 277 bool Changed = false; 278 for (auto &C : Candidates) { 279 if (C.second) 280 continue; 281 282 const MachineInstr *MI = C.first; 283 if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) { 284 // Already happen to be fixed. 285 C.second = true; 286 ++NumNSAConverted; 287 continue; 288 } 289 290 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode()); 291 int VAddr0Idx = 292 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0); 293 294 SmallVector<LiveInterval *, 16> Intervals; 295 SmallVector<MCRegister, 16> OrigRegs; 296 SlotIndex MinInd, MaxInd; 297 for (unsigned I = 0; I < Info->VAddrOperands; ++I) { 298 const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); 299 Register Reg = Op.getReg(); 300 LiveInterval *LI = &LIS->getInterval(Reg); 301 if (llvm::is_contained(Intervals, LI)) { 302 // Same register used, unable to make sequential 303 Intervals.clear(); 304 break; 305 } 306 Intervals.push_back(LI); 307 OrigRegs.push_back(VRM->getPhys(Reg)); 308 if (LI->empty()) { 309 // The address input is undef, so it doesn't contribute to the relevant 310 // range. Seed a reasonable index range if required. 311 if (I == 0) 312 MinInd = MaxInd = LIS->getInstructionIndex(*MI); 313 continue; 314 } 315 MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); 316 MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); 317 } 318 319 if (Intervals.empty()) 320 continue; 321 322 LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI 323 << "\tOriginal allocation:\t"; 324 for (auto *LI 325 : Intervals) dbgs() 326 << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI); 327 dbgs() << '\n'); 328 329 bool Success = scavengeRegs(Intervals); 330 if (!Success) { 331 LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n"); 332 if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation. 333 continue; 334 } else { 335 // Check we did not make it worse for other instructions. 336 auto *I = 337 std::lower_bound(Candidates.begin(), &C, MinInd, 338 [this](const Candidate &C, SlotIndex I) { 339 return LIS->getInstructionIndex(*C.first) < I; 340 }); 341 for (auto *E = Candidates.end(); 342 Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd; 343 ++I) { 344 if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) { 345 Success = false; 346 LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first); 347 } 348 } 349 } 350 351 if (!Success) { 352 for (unsigned I = 0; I < Info->VAddrOperands; ++I) 353 if (VRM->hasPhys(Intervals[I]->reg())) 354 LRM->unassign(*Intervals[I]); 355 356 for (unsigned I = 0; I < Info->VAddrOperands; ++I) 357 LRM->assign(*Intervals[I], OrigRegs[I]); 358 359 continue; 360 } 361 362 C.second = true; 363 ++NumNSAConverted; 364 LLVM_DEBUG( 365 dbgs() << "\tNew allocation:\t\t [" 366 << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI) 367 << " : " 368 << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI) 369 << "]\n"); 370 Changed = true; 371 } 372 373 return Changed; 374 } 375 376 bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) { 377 auto *VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM(); 378 auto *LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM(); 379 auto *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS(); 380 381 GCNNSAReassignImpl Impl(VRM, LRM, LIS); 382 return Impl.run(MF); 383 } 384 385 PreservedAnalyses 386 GCNNSAReassignPass::run(MachineFunction &MF, 387 MachineFunctionAnalysisManager &MFAM) { 388 auto &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF); 389 auto &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF); 390 auto &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF); 391 392 GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS); 393 Impl.run(MF); 394 return PreservedAnalyses::all(); 395 } 396