1 //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Insert s_delay_alu instructions to avoid stalls on GFX11+. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "GCNSubtarget.h" 16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 17 #include "SIInstrInfo.h" 18 #include "llvm/ADT/SetVector.h" 19 20 using namespace llvm; 21 22 #define DEBUG_TYPE "amdgpu-insert-delay-alu" 23 24 namespace { 25 26 class AMDGPUInsertDelayAlu : public MachineFunctionPass { 27 public: 28 static char ID; 29 30 const SIInstrInfo *SII; 31 const TargetRegisterInfo *TRI; 32 33 TargetSchedModel SchedModel; 34 35 AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {} 36 37 void getAnalysisUsage(AnalysisUsage &AU) const override { 38 AU.setPreservesCFG(); 39 MachineFunctionPass::getAnalysisUsage(AU); 40 } 41 42 // Return true if MI waits for all outstanding VALU instructions to complete. 43 static bool instructionWaitsForVALU(const MachineInstr &MI) { 44 // These instruction types wait for VA_VDST==0 before issuing. 45 const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP | 46 SIInstrFlags::FLAT | SIInstrFlags::MIMG | 47 SIInstrFlags::MTBUF | SIInstrFlags::MUBUF; 48 if (MI.getDesc().TSFlags & VA_VDST_0) 49 return true; 50 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 || 51 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) 52 return true; 53 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 54 (MI.getOperand(0).getImm() & 0xf000) == 0) 55 return true; 56 return false; 57 } 58 59 // Types of delay that can be encoded in an s_delay_alu instruction. 60 enum DelayType { VALU, TRANS, SALU, OTHER }; 61 62 // Get the delay type for an instruction with the specified TSFlags. 63 static DelayType getDelayType(uint64_t TSFlags) { 64 if (TSFlags & SIInstrFlags::TRANS) 65 return TRANS; 66 if (TSFlags & SIInstrFlags::VALU) 67 return VALU; 68 if (TSFlags & SIInstrFlags::SALU) 69 return SALU; 70 return OTHER; 71 } 72 73 // Information about the last instruction(s) that wrote to a particular 74 // regunit. In straight-line code there will only be one such instruction, but 75 // when control flow converges we merge the delay information from each path 76 // to represent the union of the worst-case delays of each type. 77 struct DelayInfo { 78 // One larger than the maximum number of (non-TRANS) VALU instructions we 79 // can encode in an s_delay_alu instruction. 80 static const unsigned VALU_MAX = 5; 81 82 // One larger than the maximum number of TRANS instructions we can encode in 83 // an s_delay_alu instruction. 84 static const unsigned TRANS_MAX = 4; 85 86 // If it was written by a (non-TRANS) VALU, remember how many clock cycles 87 // are left until it completes, and how many other (non-TRANS) VALU we have 88 // seen since it was issued. 89 uint8_t VALUCycles = 0; 90 uint8_t VALUNum = VALU_MAX; 91 92 // If it was written by a TRANS, remember how many clock cycles are left 93 // until it completes, and how many other TRANS we have seen since it was 94 // issued. 95 uint8_t TRANSCycles = 0; 96 uint8_t TRANSNum = TRANS_MAX; 97 // Also remember how many other (non-TRANS) VALU we have seen since it was 98 // issued. When an instruction depends on both a prior TRANS and a prior 99 // non-TRANS VALU, this is used to decide whether to encode a wait for just 100 // one or both of them. 101 uint8_t TRANSNumVALU = VALU_MAX; 102 103 // If it was written by an SALU, remember how many clock cycles are left 104 // until it completes. 105 uint8_t SALUCycles = 0; 106 107 DelayInfo() = default; 108 109 DelayInfo(DelayType Type, unsigned Cycles) { 110 switch (Type) { 111 default: 112 llvm_unreachable("unexpected type"); 113 case VALU: 114 VALUCycles = Cycles; 115 VALUNum = 0; 116 break; 117 case TRANS: 118 TRANSCycles = Cycles; 119 TRANSNum = 0; 120 TRANSNumVALU = 0; 121 break; 122 case SALU: 123 SALUCycles = Cycles; 124 break; 125 } 126 } 127 128 bool operator==(const DelayInfo &RHS) const { 129 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum && 130 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum && 131 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles; 132 } 133 134 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); } 135 136 // Merge another DelayInfo into this one, to represent the union of the 137 // worst-case delays of each type. 138 void merge(const DelayInfo &RHS) { 139 VALUCycles = std::max(VALUCycles, RHS.VALUCycles); 140 VALUNum = std::min(VALUNum, RHS.VALUNum); 141 TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles); 142 TRANSNum = std::min(TRANSNum, RHS.TRANSNum); 143 TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU); 144 SALUCycles = std::max(SALUCycles, RHS.SALUCycles); 145 } 146 147 // Update this DelayInfo after issuing an instruction. IsVALU should be 1 148 // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing 149 // a TRANS, else 0. Cycles is the number of cycles it takes to issue the 150 // instruction. Return true if there is no longer any useful delay info. 151 bool advance(DelayType Type, unsigned Cycles) { 152 bool Erase = true; 153 154 VALUNum += (Type == VALU); 155 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) { 156 // Forget about the VALU instruction. It was too far back or has 157 // definitely completed by now. 158 VALUNum = VALU_MAX; 159 VALUCycles = 0; 160 } else { 161 VALUCycles -= Cycles; 162 Erase = false; 163 } 164 165 TRANSNum += (Type == TRANS); 166 TRANSNumVALU += (Type == VALU); 167 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) { 168 // Forget about any TRANS instruction. It was too far back or has 169 // definitely completed by now. 170 TRANSNum = TRANS_MAX; 171 TRANSNumVALU = VALU_MAX; 172 TRANSCycles = 0; 173 } else { 174 TRANSCycles -= Cycles; 175 Erase = false; 176 } 177 178 if (SALUCycles <= Cycles) { 179 // Forget about any SALU instruction. It has definitely completed by 180 // now. 181 SALUCycles = 0; 182 } else { 183 SALUCycles -= Cycles; 184 Erase = false; 185 } 186 187 return Erase; 188 } 189 190 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 191 void dump() const { 192 if (VALUCycles) 193 dbgs() << " VALUCycles=" << (int)VALUCycles; 194 if (VALUNum < VALU_MAX) 195 dbgs() << " VALUNum=" << (int)VALUNum; 196 if (TRANSCycles) 197 dbgs() << " TRANSCycles=" << (int)TRANSCycles; 198 if (TRANSNum < TRANS_MAX) 199 dbgs() << " TRANSNum=" << (int)TRANSNum; 200 if (TRANSNumVALU < VALU_MAX) 201 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU; 202 if (SALUCycles) 203 dbgs() << " SALUCycles=" << (int)SALUCycles; 204 } 205 #endif 206 }; 207 208 // A map from regunits to the delay info for that regunit. 209 struct DelayState : DenseMap<unsigned, DelayInfo> { 210 // Merge another DelayState into this one by merging the delay info for each 211 // regunit. 212 void merge(const DelayState &RHS) { 213 for (const auto &KV : RHS) { 214 iterator It; 215 bool Inserted; 216 std::tie(It, Inserted) = insert(KV); 217 if (!Inserted) 218 It->second.merge(KV.second); 219 } 220 } 221 222 // Advance the delay info for each regunit, erasing any that are no longer 223 // useful. 224 void advance(DelayType Type, unsigned Cycles) { 225 iterator Next; 226 for (auto I = begin(), E = end(); I != E; I = Next) { 227 Next = std::next(I); 228 if (I->second.advance(Type, Cycles)) 229 erase(I); 230 } 231 } 232 233 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 234 void dump(const TargetRegisterInfo *TRI) const { 235 if (empty()) { 236 dbgs() << " empty\n"; 237 return; 238 } 239 240 // Dump DelayInfo for each RegUnit in numerical order. 241 SmallVector<const_iterator, 8> Order; 242 Order.reserve(size()); 243 for (const_iterator I = begin(), E = end(); I != E; ++I) 244 Order.push_back(I); 245 llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) { 246 return A->first < B->first; 247 }); 248 for (const_iterator I : Order) { 249 dbgs() << " " << printRegUnit(I->first, TRI); 250 I->second.dump(); 251 dbgs() << "\n"; 252 } 253 } 254 #endif 255 }; 256 257 // The saved delay state at the end of each basic block. 258 DenseMap<MachineBasicBlock *, DelayState> BlockState; 259 260 // Emit an s_delay_alu instruction if necessary before MI. 261 MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, 262 MachineInstr *LastDelayAlu) { 263 unsigned Imm = 0; 264 265 // Wait for a TRANS instruction. 266 if (Delay.TRANSNum < DelayInfo::TRANS_MAX) 267 Imm |= 4 + Delay.TRANSNum; 268 269 // Wait for a VALU instruction (if it's more recent than any TRANS 270 // instruction that we're also waiting for). 271 if (Delay.VALUNum < DelayInfo::VALU_MAX && 272 Delay.VALUNum <= Delay.TRANSNumVALU) { 273 if (Imm & 0xf) 274 Imm |= Delay.VALUNum << 7; 275 else 276 Imm |= Delay.VALUNum; 277 } 278 279 // Wait for an SALU instruction. 280 if (Delay.SALUCycles) { 281 if (Imm & 0x780) { 282 // We have already encoded a VALU and a TRANS delay. There's no room in 283 // the encoding for an SALU delay as well, so just drop it. 284 } else if (Imm & 0xf) { 285 Imm |= (Delay.SALUCycles + 8) << 7; 286 } else { 287 Imm |= Delay.SALUCycles + 8; 288 } 289 } 290 291 // Don't emit the s_delay_alu instruction if there's nothing to wait for. 292 if (!Imm) 293 return LastDelayAlu; 294 295 // If we only need to wait for one instruction, try encoding it in the last 296 // s_delay_alu that we emitted. 297 if (!(Imm & 0x780) && LastDelayAlu) { 298 unsigned Skip = 0; 299 for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), 300 E = MachineBasicBlock::instr_iterator(MI); 301 ++I != E;) { 302 if (!I->isBundle() && !I->isMetaInstruction()) 303 ++Skip; 304 } 305 if (Skip < 6) { 306 MachineOperand &Op = LastDelayAlu->getOperand(0); 307 unsigned LastImm = Op.getImm(); 308 assert((LastImm & ~0xf) == 0 && 309 "Remembered an s_delay_alu with no room for another delay!"); 310 LastImm |= Imm << 7 | Skip << 4; 311 Op.setImm(LastImm); 312 return nullptr; 313 } 314 } 315 316 auto &MBB = *MI.getParent(); 317 MachineInstr *DelayAlu = 318 BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm); 319 // Remember the s_delay_alu for next time if there is still room in it to 320 // encode another delay. 321 return (Imm & 0x780) ? nullptr : DelayAlu; 322 } 323 324 bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { 325 DelayState State; 326 for (auto *Pred : MBB.predecessors()) 327 State.merge(BlockState[Pred]); 328 329 LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB) 330 << "\n"; 331 State.dump(TRI);); 332 333 bool Changed = false; 334 MachineInstr *LastDelayAlu = nullptr; 335 336 // Iterate over the contents of bundles, but don't emit any instructions 337 // inside a bundle. 338 for (auto &MI : MBB.instrs()) { 339 if (MI.isBundle() || MI.isMetaInstruction()) 340 continue; 341 342 // Ignore some more instructions that do not generate any code. 343 switch (MI.getOpcode()) { 344 case AMDGPU::SI_RETURN_TO_EPILOG: 345 continue; 346 } 347 348 DelayType Type = getDelayType(MI.getDesc().TSFlags); 349 350 if (instructionWaitsForVALU(MI)) { 351 // Forget about all outstanding VALU delays. 352 State = DelayState(); 353 } else if (Type != OTHER) { 354 DelayInfo Delay; 355 // TODO: Scan implicit uses too? 356 for (const auto &Op : MI.explicit_uses()) { 357 if (Op.isReg()) { 358 // One of the operands of the writelane is also the output operand. 359 // This creates the insertion of redundant delays. Hence, we have to 360 // ignore this operand. 361 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) 362 continue; 363 for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) { 364 auto It = State.find(*UI); 365 if (It != State.end()) { 366 Delay.merge(It->second); 367 State.erase(*UI); 368 } 369 } 370 } 371 } 372 if (Emit && !MI.isBundledWithPred()) { 373 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or 374 // just ignore them? 375 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); 376 } 377 } 378 379 if (Type != OTHER) { 380 // TODO: Scan implicit defs too? 381 for (const auto &Op : MI.defs()) { 382 unsigned Latency = SchedModel.computeOperandLatency( 383 &MI, MI.getOperandNo(&Op), nullptr, 0); 384 for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) 385 State[*UI] = DelayInfo(Type, Latency); 386 } 387 } 388 389 // Advance by the number of cycles it takes to issue this instruction. 390 // TODO: Use a more advanced model that accounts for instructions that 391 // take multiple cycles to issue on a particular pipeline. 392 unsigned Cycles = SIInstrInfo::getNumWaitStates(MI); 393 // TODO: In wave64 mode, double the number of cycles for VALU and VMEM 394 // instructions on the assumption that they will usually have to be issued 395 // twice? 396 State.advance(Type, Cycles); 397 398 LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI);); 399 } 400 401 if (Emit) { 402 assert(State == BlockState[&MBB] && 403 "Basic block state should not have changed on final pass!"); 404 } else if (State != BlockState[&MBB]) { 405 BlockState[&MBB] = std::move(State); 406 Changed = true; 407 } 408 return Changed; 409 } 410 411 bool runOnMachineFunction(MachineFunction &MF) override { 412 if (skipFunction(MF.getFunction())) 413 return false; 414 415 LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() 416 << "\n"); 417 418 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 419 if (!ST.hasDelayAlu()) 420 return false; 421 422 SII = ST.getInstrInfo(); 423 TRI = ST.getRegisterInfo(); 424 425 SchedModel.init(&ST); 426 427 // Calculate the delay state for each basic block, iterating until we reach 428 // a fixed point. 429 SetVector<MachineBasicBlock *> WorkList; 430 for (auto &MBB : reverse(MF)) 431 WorkList.insert(&MBB); 432 while (!WorkList.empty()) { 433 auto &MBB = *WorkList.pop_back_val(); 434 bool Changed = runOnMachineBasicBlock(MBB, false); 435 if (Changed) 436 WorkList.insert(MBB.succ_begin(), MBB.succ_end()); 437 } 438 439 LLVM_DEBUG(dbgs() << "Final pass over all BBs\n"); 440 441 // Make one last pass over all basic blocks to emit s_delay_alu 442 // instructions. 443 bool Changed = false; 444 for (auto &MBB : MF) 445 Changed |= runOnMachineBasicBlock(MBB, true); 446 return Changed; 447 } 448 }; 449 450 } // namespace 451 452 char AMDGPUInsertDelayAlu::ID = 0; 453 454 char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID; 455 456 INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU", 457 false, false) 458