//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-insert-delay-alu"

namespace {

class AMDGPUInsertDelayAlu : public MachineFunctionPass {
public:
  static char ID;

  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  TargetSchedModel SchedModel;

  AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  }

  // Types of delay that can be encoded in an s_delay_alu instruction.
  enum DelayType { VALU, TRANS, SALU, OTHER };

  // Get the delay type for an instruction with the specified TSFlags.
  static DelayType getDelayType(uint64_t TSFlags) {
    if (TSFlags & SIInstrFlags::TRANS)
      return TRANS;
    if (TSFlags & SIInstrFlags::VALU)
      return VALU;
    if (TSFlags & SIInstrFlags::SALU)
      return SALU;
    return OTHER;
  }

  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction,
  // but when control flow converges we merge the delay information from each
  // path to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static constexpr unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode
    // in an s_delay_alu instruction.
    static constexpr unsigned TRANS_MAX = 4;

    // One larger than the maximum number of SALU cycles we can encode in an
    // s_delay_alu instruction.
    static constexpr unsigned SALU_CYCLES_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we
    // have seen since it was issued.
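    // (A VALUNum of VALU_MAX, and likewise a TRANSNum of TRANS_MAX below,
    // means that no outstanding write of that kind is currently being
    // tracked.)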
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX;

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX;
    // Also remember how many other (non-TRANS) VALU we have seen since it
    // was issued. When an instruction depends on both a prior TRANS and a
    // prior non-TRANS VALU, this is used to decide whether to encode a wait
    // for just one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        // Guard against pseudo-instructions like SI_CALL which are marked as
        // SALU but with a very high latency.
        SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type.
    void merge(const DelayInfo &RHS) {
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of delay type Type,
    // where Cycles is the number of cycles it takes to issue the instruction.
    // Return true if there is no longer any useful delay info.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };

  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<unsigned, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for
    // each regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no
    // longer useful.
    void advance(DelayType Type, unsigned Cycles) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << " empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << " " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };

  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI.
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction.
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      if (Imm & 0xf)
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction.
    if (Delay.SALUCycles) {
      assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room
        // in the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the
    // last s_delay_alu that we emitted.
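    // Note how the immediate is packed here: instid0 occupies bits [3:0],
    // the skip count bits [6:4] and instid1 bits [10:7], so "Imm & 0x780"
    // tells us whether the second dependency slot is already in use.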
    if (!(Imm & 0x780) && LastDelayAlu) {
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      if (Skip < 6) {
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }

  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    DelayState State;
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    MachineInstr *LastDelayAlu = nullptr;

    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI.getDesc().TSFlags);

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // TODO: This is overkill since it also forgets about SALU delays.
        State = DelayState();
      } else if (Type != OTHER) {
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of a writelane is tied to its output
            // operand, so waiting on it would only insert a redundant delay.
            // Ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
              auto It = State.find(Unit);
              if (It != State.end()) {
                Delay.merge(It->second);
                State.erase(Unit);
              }
            }
          }
        }
        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop
          // or just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel.computeOperandLatency(
              &MI, Op.getOperandNo(), nullptr, 0);
          for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
            State[Unit] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be
      // issued twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (State != BlockState[&MBB]) {
      BlockState[&MBB] = std::move(State);
      Changed = true;
    }
    return Changed;
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;

    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                      << "\n");

    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasDelayAlu())
      return false;

    SII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();

    SchedModel.init(&ST);

    // Calculate the delay state for each basic block, iterating until we
    // reach a fixed point.
    SetVector<MachineBasicBlock *> WorkList;
    for (auto &MBB : reverse(MF))
      WorkList.insert(&MBB);
    while (!WorkList.empty()) {
      auto &MBB = *WorkList.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed)
        WorkList.insert(MBB.succ_begin(), MBB.succ_end());
    }

    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

    // Make one last pass over all basic blocks to emit s_delay_alu
    // instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);
    return Changed;
  }
};

} // namespace

char AMDGPUInsertDelayAlu::ID = 0;

char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;

INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
                false, false)