//===-- SIFormMemoryClauses.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass extends the live ranges of registers used as pointers in
/// sequences of adjacent SMEM and VMEM instructions if XNACK is enabled. A
/// load that would overwrite a pointer would require breaking the soft clause.
/// Artificially extend the live ranges of the pointer operands by adding
/// implicit-def early-clobber operands throughout the soft clause.
///
//===----------------------------------------------------------------------===//

#include "SIFormMemoryClauses.h"
#include "AMDGPU.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-form-memory-clauses"

// Clauses longer than 15 instructions would overflow one of the counters
// and stall. They can stall even earlier if there are outstanding counters.
29 static cl::opt<unsigned> 30 MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15), 31 cl::desc("Maximum length of a memory clause, instructions")); 32 33 namespace { 34 35 class SIFormMemoryClausesImpl { 36 using RegUse = DenseMap<unsigned, std::pair<unsigned, LaneBitmask>>; 37 38 bool canBundle(const MachineInstr &MI, const RegUse &Defs, 39 const RegUse &Uses) const; 40 bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT); 41 void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; 42 bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, 43 GCNDownwardRPTracker &RPT); 44 45 const GCNSubtarget *ST; 46 const SIRegisterInfo *TRI; 47 const MachineRegisterInfo *MRI; 48 SIMachineFunctionInfo *MFI; 49 LiveIntervals *LIS; 50 51 unsigned LastRecordedOccupancy; 52 unsigned MaxVGPRs; 53 unsigned MaxSGPRs; 54 55 public: 56 SIFormMemoryClausesImpl(LiveIntervals *LS) : LIS(LS) {} 57 bool run(MachineFunction &MF); 58 }; 59 60 class SIFormMemoryClausesLegacy : public MachineFunctionPass { 61 public: 62 static char ID; 63 64 SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) { 65 initializeSIFormMemoryClausesLegacyPass(*PassRegistry::getPassRegistry()); 66 } 67 68 bool runOnMachineFunction(MachineFunction &MF) override; 69 70 StringRef getPassName() const override { 71 return "SI Form memory clauses"; 72 } 73 74 void getAnalysisUsage(AnalysisUsage &AU) const override { 75 AU.addRequired<LiveIntervalsWrapperPass>(); 76 AU.setPreservesAll(); 77 MachineFunctionPass::getAnalysisUsage(AU); 78 } 79 80 MachineFunctionProperties getClearedProperties() const override { 81 return MachineFunctionProperties().setIsSSA(); 82 } 83 }; 84 85 } // End anonymous namespace. 
// Legacy pass registration. LiveIntervalsWrapperPass is declared as a
// dependency because the pass requires it in getAnalysisUsage().
INITIALIZE_PASS_BEGIN(SIFormMemoryClausesLegacy, DEBUG_TYPE,
                      "SI Form memory clauses", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(SIFormMemoryClausesLegacy, DEBUG_TYPE,
                    "SI Form memory clauses", false, false)

char SIFormMemoryClausesLegacy::ID = 0;

// Public handle referring to the legacy pass ID.
char &llvm::SIFormMemoryClausesID = SIFormMemoryClausesLegacy::ID;

// Factory for the legacy pass; the caller receives a newly allocated pass.
FunctionPass *llvm::createSIFormMemoryClausesLegacyPass() {
  return new SIFormMemoryClausesLegacy();
}

// Returns true if SIInstrInfo classifies MI as a VMEM instruction.
static bool isVMEMClauseInst(const MachineInstr &MI) {
  return SIInstrInfo::isVMEM(MI);
}

// Returns true if SIInstrInfo::isSMRD holds for MI; this is the membership
// test used for SMEM clauses.
static bool isSMEMClauseInst(const MachineInstr &MI) {
  return SIInstrInfo::isSMRD(MI);
}

// Returns true if MI may be a member of a clause of the requested kind
// (VMEM when IsVMEMClause is set, SMEM otherwise).
//
// There is no sense in creating store clauses: they do not define anything,
// thus there is nothing to set early-clobber on.
static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
  assert(!MI.isDebugInstr() && "debug instructions should not reach here");
  // Instructions that are already part of a bundle are left alone.
  if (MI.isBundled())
    return false;
  // Only pure loads qualify.
  if (!MI.mayLoad() || MI.mayStore())
    return false;
  if (SIInstrInfo::isAtomic(MI))
    return false;
  // The instruction kind has to match the kind of the clause being formed.
  if (IsVMEMClause && !isVMEMClauseInst(MI))
    return false;
  if (!IsVMEMClause && !isSMEMClauseInst(MI))
    return false;
  // If this is a load instruction where the result has been coalesced with an
  // operand, then we cannot clause it.
  for (const MachineOperand &ResMO : MI.defs()) {
    Register ResReg = ResMO.getReg();
    for (const MachineOperand &MO : MI.all_uses()) {
      if (MO.getReg() == ResReg)
        return false;
    }
    break; // Only check the first def.
  }
  return true;
}

// Translates the flags of register operand MO into a RegState bitmask.
// The renamable flag is only recorded for physical registers.
static unsigned getMopState(const MachineOperand &MO) {
  unsigned S = 0;
  if (MO.isImplicit())
    S |= RegState::Implicit;
  if (MO.isDead())
    S |= RegState::Dead;
  if (MO.isUndef())
    S |= RegState::Undef;
  if (MO.isKill())
    S |= RegState::Kill;
  if (MO.isEarlyClobber())
    S |= RegState::EarlyClobber;
  if (MO.getReg().isPhysical() && MO.isRenamable())
    S |= RegState::Renamable;
  return S;
}

// Returns false if there is a use of a def already in the map.
// In this case we must break the clause.
//
// Frame-index operands and tied register operands also cause a false return.
bool SIFormMemoryClausesImpl::canBundle(const MachineInstr &MI,
                                        const RegUse &Defs,
                                        const RegUse &Uses) const {
  // Check interference with defs.
  for (const MachineOperand &MO : MI.operands()) {
    // TODO: Prologue/Epilogue Insertion pass does not process bundled
    //       instructions.
    if (MO.isFI())
      return false;

    if (!MO.isReg())
      continue;

    Register Reg = MO.getReg();

    // If it is tied we will need to write same register as we read.
    if (MO.isTied())
      return false;

    // A def is checked against the uses collected so far, and a use against
    // the defs collected so far.
    const RegUse &Map = MO.isDef() ? Uses : Defs;
    auto Conflict = Map.find(Reg);
    if (Conflict == Map.end())
      continue;

    // Any recorded physical register is treated as a conflict; no lane
    // comparison is attempted.
    if (Reg.isPhysical())
      return false;

    // For virtual registers it is only a conflict if the lanes overlap.
    LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
    if ((Conflict->second.second & Mask).any())
      return false;
  }

  return true;
}

// Since all defs in the clause are early clobber we can run out of registers.
// Function returns false if pressure would hit the limit if instruction is
// bundled into a memory clause.
//
// On a true return the computed occupancy is saved in LastRecordedOccupancy.
// The tracker RPT is advanced in either case.
bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
                                            GCNDownwardRPTracker &RPT) {
  // NB: skip advanceBeforeNext() call. Since all defs will be marked
  // early-clobber they will all stay alive at least to the end of the
  // clause. Therefore we should not decrease pressure even if load
  // pointer becomes dead and could otherwise be reused for destination.
  RPT.advanceToNext();
  GCNRegPressure MaxPressure = RPT.moveMaxPressure();
  unsigned Occupancy = MaxPressure.getOccupancy(
      *ST,
      MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());

  // Don't push over half the register budget. We don't want to introduce
  // spilling just to form a soft clause.
  //
  // FIXME: This pressure check is fundamentally broken. First, this is checking
  // the global pressure, not the pressure at this specific point in the
  // program. Second, it's not accounting for the increased liveness of the use
  // operands due to the early clobber we will introduce. Third, the pressure
  // tracking does not account for the alignment requirements for SGPRs, or the
  // fragmentation of registers the allocator will need to satisfy.
  if (Occupancy >= MFI->getMinAllowedOccupancy() &&
      MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
      MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
    LastRecordedOccupancy = Occupancy;
    return true;
  }
  return false;
}

// Collect register defs and uses along with their lane masks and states.
//
// Def operands go into Defs, all other register operands into Uses. Operands
// with a null register are skipped.
void SIFormMemoryClausesImpl::collectRegUses(const MachineInstr &MI,
                                             RegUse &Defs, RegUse &Uses) const {
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;

    // Virtual registers get the lane mask of the operand's subregister index;
    // physical registers are recorded with all lanes set.
    LaneBitmask Mask = Reg.isVirtual()
                           ? TRI->getSubRegIndexLaneMask(MO.getSubReg())
                           : LaneBitmask::getAll();
    RegUse &Map = MO.isDef() ? Defs : Uses;

    unsigned State = getMopState(MO);
    auto [Loc, Inserted] = Map.try_emplace(Reg, State, Mask);
    // The register was seen before: merge the new flags and lanes into the
    // existing entry.
    if (!Inserted) {
      Loc->second.first |= State;
      Loc->second.second |= Mask;
    }
  }
}

// Check register def/use conflicts, occupancy limits and collect def/use maps.
// Return true if instruction can be bundled with previous. If it cannot
// def/use maps are not updated.
bool SIFormMemoryClausesImpl::processRegUses(const MachineInstr &MI,
                                             RegUse &Defs, RegUse &Uses,
                                             GCNDownwardRPTracker &RPT) {
  if (!canBundle(MI, Defs, Uses))
    return false;

  if (!checkPressure(MI, RPT))
    return false;

  collectRegUses(MI, Defs, Uses);
  return true;
}

// Scans every basic block of MF for runs of at least two compatible load
// instructions and, for each run, inserts KILL instructions after its last
// instruction for the used virtual registers that are not live past the run.
// Returns true if at least one such run was found. Does nothing and returns
// false when XNACK is not enabled for the subtarget.
bool SIFormMemoryClausesImpl::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->isXNACKEnabled())
    return false;

  const SIInstrInfo *TII = ST->getInstrInfo();
  TRI = ST->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();
  SlotIndexes *Ind = LIS->getSlotIndexes();
  bool Changed = false;

  MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
  MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
  // A function attribute of the same name overrides the command line default.
  unsigned FuncMaxClause = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-max-memory-clause", MaxClause);

  for (MachineBasicBlock &MBB : MF) {
    GCNDownwardRPTracker RPT(*LIS);
    // Next is shared with the inner scan loop below, so the outer loop
    // resumes wherever that scan stopped.
    MachineBasicBlock::instr_iterator Next;
    for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
      MachineInstr &MI = *I;
      Next = std::next(I);

      if (MI.isMetaInstruction())
        continue;

      // The kind of the first instruction determines the kind of the clause.
      bool IsVMEM = isVMEMClauseInst(MI);

      if (!isValidClauseInst(MI, IsVMEM))
        continue;

      if (!RPT.getNext().isValid())
        RPT.reset(MI);
      else { // Advance the state to the current MI.
        RPT.advance(MachineBasicBlock::const_iterator(MI));
        RPT.advanceBeforeNext();
      }

      // Snapshot of the live registers at MI, used to rewind the tracker
      // whenever the attempt is abandoned or finished.
      const GCNRPTracker::LiveRegSet LiveRegsCopy(RPT.getLiveRegs());
      RegUse Defs, Uses;
      if (!processRegUses(MI, Defs, Uses, RPT)) {
        RPT.reset(MI, &LiveRegsCopy);
        continue;
      }

      // Extend the clause over the following instructions for as long as
      // they are compatible and the length limit is not reached.
      MachineBasicBlock::iterator LastClauseInst = Next;
      unsigned Length = 1;
      for ( ; Next != E && Length < FuncMaxClause; ++Next) {
        // Debug instructions should not change the kill insertion.
        if (Next->isMetaInstruction())
          continue;

        if (!isValidClauseInst(*Next, IsVMEM))
          break;

        // A load from pointer which was loaded inside the same bundle is an
        // impossible clause because we will need to write and read the same
        // register inside. In this case processRegUses will return false.
        if (!processRegUses(*Next, Defs, Uses, RPT))
          break;

        LastClauseInst = Next;
        ++Length;
      }
      // A single instruction does not make a clause.
      if (Length < 2) {
        RPT.reset(MI, &LiveRegsCopy);
        continue;
      }

      Changed = true;
      MFI->limitOccupancy(LastRecordedOccupancy);

      assert(!LastClauseInst->isMetaInstruction());

      // Slot of the first clause instruction, and the slot following the last
      // clause instruction.
      SlotIndex ClauseLiveInIdx = LIS->getInstructionIndex(MI);
      SlotIndex ClauseLiveOutIdx =
          LIS->getInstructionIndex(*LastClauseInst).getNextIndex();

      // Track the last inserted kill.
      MachineInstrBuilder Kill;

      // Insert one kill per register, with operands covering all necessary
      // subregisters.
      for (auto &&R : Uses) {
        Register Reg = R.first;
        if (Reg.isPhysical())
          continue;

        // Collect the register operands we should extend the live ranges of.
        // Each entry is a (register state flags, subregister index) pair.
        SmallVector<std::tuple<unsigned, unsigned>> KillOps;
        const LiveInterval &LI = LIS->getInterval(R.first);

        if (!LI.hasSubRanges()) {
          // No lane tracking: kill the whole register unless it is live after
          // the clause anyway.
          if (!LI.liveAt(ClauseLiveOutIdx)) {
            KillOps.emplace_back(R.second.first | RegState::Kill,
                                 AMDGPU::NoSubRegister);
          }
        } else {
          // Gather the lanes that are live at the clause start but not after
          // the clause end.
          LaneBitmask KilledMask;
          for (const LiveInterval::SubRange &SR : LI.subranges()) {
            if (SR.liveAt(ClauseLiveInIdx) && !SR.liveAt(ClauseLiveOutIdx))
              KilledMask |= SR.LaneMask;
          }

          if (KilledMask.none())
            continue;

          // Turn the lane mask into subregister indexes, one kill operand per
          // index.
          SmallVector<unsigned> KilledIndexes;
          bool Success = TRI->getCoveringSubRegIndexes(
              MRI->getRegClass(Reg), KilledMask, KilledIndexes);
          (void)Success;
          assert(Success && "Failed to find subregister mask to cover lanes");
          for (unsigned SubReg : KilledIndexes) {
            KillOps.emplace_back(R.second.first | RegState::Kill, SubReg);
          }
        }

        if (KillOps.empty())
          continue;

        // We only want to extend the live ranges of used registers. If they
        // already have existing uses beyond the bundle, we don't need the kill.
        //
        // It's possible all of the use registers were already live past the
        // bundle.
        Kill = BuildMI(*MI.getParent(), std::next(LastClauseInst),
                       DebugLoc(), TII->get(AMDGPU::KILL));
        for (auto &Op : KillOps)
          Kill.addUse(Reg, std::get<0>(Op), std::get<1>(Op));
        // Give the new instruction a slot index.
        Ind->insertMachineInstrInMaps(*Kill);
      }

      // Restore the state after processing the end of the bundle.
      RPT.reset(MI, &LiveRegsCopy);

      // No kill was inserted, so the live intervals are left untouched.
      if (!Kill)
        continue;

      // Recompute the live intervals of the virtual registers defined in the
      // clause. Each one is also dropped from Uses so that the loop below
      // does not recompute the same interval a second time.
      for (auto &&R : Defs) {
        Register Reg = R.first;
        Uses.erase(Reg);
        if (Reg.isPhysical())
          continue;
        LIS->removeInterval(Reg);
        LIS->createAndComputeVirtRegInterval(Reg);
      }

      // Recompute the live intervals of the remaining used virtual registers.
      for (auto &&R : Uses) {
        Register Reg = R.first;
        if (Reg.isPhysical())
          continue;
        LIS->removeInterval(Reg);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
    }
  }

  return Changed;
}

// Legacy pass manager entry point: honours skipFunction(), obtains
// LiveIntervals from the wrapper pass and delegates to the implementation.
bool SIFormMemoryClausesLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  return SIFormMemoryClausesImpl(LIS).run(MF);
}

// New pass manager entry point. The result of the implementation's run() is
// discarded and all analyses are reported as preserved.
PreservedAnalyses
SIFormMemoryClausesPass::run(MachineFunction &MF,
                             MachineFunctionAnalysisManager &MFAM) {
  LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
  SIFormMemoryClausesImpl(&LIS).run(MF);
  return PreservedAnalyses::all();
}