//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-insert-delay-alu"

namespace {

class AMDGPUInsertDelayAlu {
public:
  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  const TargetSchedModel *SchedModel;

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    // The returning s_sendmsg variants are treated the same way.
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    // An explicit s_waitcnt_depctr whose VA_VDST field is 0 also drains all
    // outstanding VALU writes before issuing.
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  }

  // Return true if MI stalls until outstanding SGPR writes have completed.
  static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
    // These instruction types wait for VA_SDST==0 before issuing.
    uint64_t MIFlags = MI.getDesc().TSFlags;
    // SMEM instructions always wait for outstanding SGPR writes.
    if (MIFlags & SIInstrFlags::SMRD)
      return true;

    // Conservatively treat any SALU instruction with at least one register
    // operand as waiting for prior SGPR writes.
    if (MIFlags & SIInstrFlags::SALU) {
      for (auto &Op : MI.operands()) {
        if (Op.isReg())
          return true;
      }
    }
    return false;
  }

  // Types of delay that can be encoded in an s_delay_alu instruction.
  enum DelayType { VALU, TRANS, SALU, OTHER };

  // Get the delay type for an instruction with the specified TSFlags.
  static DelayType getDelayType(uint64_t TSFlags) {
    // TRANS is checked before VALU: an instruction may carry both flags and
    // must be classified as TRANS in that case.
    if (TSFlags & SIInstrFlags::TRANS)
      return TRANS;
    if (TSFlags & SIInstrFlags::VALU)
      return VALU;
    if (TSFlags & SIInstrFlags::SALU)
      return SALU;
    return OTHER;
  }

  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction, but
  // when control flow converges we merge the delay information from each path
  // to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static constexpr unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode in
    // an s_delay_alu instruction.
    static constexpr unsigned TRANS_MAX = 4;

    // One larger than the maximum number of SALU cycles we can encode in an
    // s_delay_alu instruction.
    static constexpr unsigned SALU_CYCLES_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we have
    // seen since it was issued.
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX;

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX;
    // Also remember how many other (non-TRANS) VALU we have seen since it was
    // issued. When an instruction depends on both a prior TRANS and a prior
    // non-TRANS VALU, this is used to decide whether to encode a wait for just
    // one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    // Construct the delay info for a single write by an instruction of the
    // given type whose result takes Cycles clock cycles to become available.
    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        // Guard against pseudo-instructions like SI_CALL which are marked as
        // SALU but with a very high latency.
        SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type. Cycles are maximized (worst remaining
    // latency); instruction counts are minimized (most recent producer).
    void merge(const DelayInfo &RHS) {
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of the specified type.
    // Cycles is the number of cycles it takes to issue the instruction. Return
    // true if there is no longer any useful delay info.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Print only the fields that still carry useful delay information.
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };

  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<unsigned, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for each
    // regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no longer
    // useful.
    void advance(DelayType Type, unsigned Cycles) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        // Compute Next up front because erase(I) invalidates I.
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

    // Erase the delay info for every VALU write that is at least VALUNum
    // instructions old; used when an instruction implicitly waits on older
    // VALU-to-SGPR writes.
    void advanceByVALUNum(unsigned VALUNum) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        Next = std::next(I);
        if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
          erase(I);
        }
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << " empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << " " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };

  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI.
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction.
    // TRANS delays occupy instid values 4..6 in the encoding.
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      // Use the second instid slot (bits 7+) if the first (bits 0-3) is taken.
      if (Imm & 0xf)
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction.
    if (Delay.SALUCycles) {
      assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room in
        // the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the last
    // s_delay_alu that we emitted.
    if (!(Imm & 0x780) && LastDelayAlu) {
      // Count the code-generating instructions between the last s_delay_alu
      // and MI; the SKIP field can only express distances of less than 6.
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      if (Skip < 6) {
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }

  // Process a single basic block. When Emit is false, only compute the
  // outgoing DelayState; when Emit is true, also insert/update s_delay_alu
  // instructions. Returns true if the saved state for MBB changed.
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    DelayState State;
    // Start from the worst-case union of all predecessor end-of-block states.
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    MachineInstr *LastDelayAlu = nullptr;

    // NOTE(review): regunit 0 doubles as the "no SGPR pending" sentinel here;
    // this assumes regunit 0 is never an SGPR regunit written by a VALU —
    // confirm against the target's regunit numbering.
    MCRegUnit LastSGPRFromVALU = 0;
    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI.getDesc().TSFlags);

      // Instructions that wait on SGPR writes implicitly cover any VALU
      // write to an SGPR that is at least as old as the tracked one.
      if (instructionWaitsForSGPRWrites(MI)) {
        auto It = State.find(LastSGPRFromVALU);
        if (It != State.end()) {
          DelayInfo Info = It->getSecond();
          State.advanceByVALUNum(Info.VALUNum);
          LastSGPRFromVALU = 0;
        }
      }

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // TODO: This is overkill since it also forgets about SALU delays.
        State = DelayState();
      } else if (Type != OTHER) {
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of the writelane is also the output operand.
            // This creates the insertion of redundant delays. Hence, we have to
            // ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            // Collect (and consume) the pending delay for every regunit this
            // instruction reads.
            for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
              auto It = State.find(Unit);
              if (It != State.end()) {
                Delay.merge(It->second);
                State.erase(Unit);
              }
            }
          }
        }

        // Remember the first SGPR def of a VALU so that a later implicit
        // SGPR-write wait can clear the corresponding delays.
        if (SII->isVALU(MI.getOpcode())) {
          for (const auto &Op : MI.defs()) {
            Register Reg = Op.getReg();
            if (AMDGPU::isSGPR(Reg, TRI)) {
              LastSGPRFromVALU = *TRI->regunits(Reg).begin();
              break;
            }
          }
        }

        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
          // just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel->computeOperandLatency(
              &MI, Op.getOperandNo(), nullptr, 0);
          for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
            State[Unit] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be issued
      // twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
      BS = std::move(State);
      Changed = true;
    }
    return Changed;
  }

  // Run on the whole function: iterate the per-block delay states to a fixed
  // point, then make a final emitting pass. Returns true if the function was
  // modified.
  bool run(MachineFunction &MF) {
    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                      << "\n");

    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    // s_delay_alu only exists on subtargets that support it (GFX11+).
    if (!ST.hasDelayAlu())
      return false;

    SII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();
    SchedModel = &SII->getSchedModel();

    // Calculate the delay state for each basic block, iterating until we reach
    // a fixed point.
    SetVector<MachineBasicBlock *> WorkList;
    // Seed in reverse so pop_back_val() visits blocks in layout order.
    for (auto &MBB : reverse(MF))
      WorkList.insert(&MBB);
    while (!WorkList.empty()) {
      auto &MBB = *WorkList.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed)
        WorkList.insert_range(MBB.successors());
    }

    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

    // Make one last pass over all basic blocks to emit s_delay_alu
    // instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);
    return Changed;
  }
};

// Legacy pass-manager wrapper around the AMDGPUInsertDelayAlu implementation.
class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only inserts instructions within blocks; the CFG is untouched.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;
    AMDGPUInsertDelayAlu Impl;
    return Impl.run(MF);
  }
};
} // namespace

// New pass-manager entry point.
PreservedAnalyses
AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  if (!AMDGPUInsertDelayAlu().run(MF))
    return PreservedAnalyses::all();
  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

char AMDGPUInsertDelayAluLegacy::ID = 0;

char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;

INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
                "AMDGPU Insert Delay ALU", false, false)