//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"

static cl::opt<bool> GlobalEnableSGPRHazardWaits(
    "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
    cl::desc("Enable required s_wait_alu on SGPR hazards"));

static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
    "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
    cl::desc("Cull hazards on function boundaries"));

static cl::opt<bool>
    GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
                                   cl::init(false), cl::Hidden,
                                   cl::desc("Cull hazards on memory waits"));

static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
    "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
    cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
             "wait"));

namespace {

class AMDGPUWaitSGPRHazards {
public:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  unsigned DsNopCount;

  bool EnableSGPRHazardWaits;
  bool CullSGPRHazardsOnFunctionBoundary;
  bool CullSGPRHazardsAtMemWait;
  unsigned CullSGPRHazardsMemWaitThreshold;

  AMDGPUWaitSGPRHazards() {}

  // Return the numeric ID 0-127 for a given SGPR.
  static std::optional<unsigned> sgprNumber(Register Reg,
                                            const SIRegisterInfo &TRI) {
    switch (Reg) {
    case AMDGPU::M0:
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return {};
    default:
      break;
    }
    unsigned RegN = TRI.getHWRegIndex(Reg);
    if (RegN > 127)
      return {};
    return RegN;
  }

  static inline bool isVCC(Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  }

  // Adjust global offsets for instructions bundled with S_GETPC_B64 after
  // insertion of a new instruction.
  static void updateGetPCBundle(MachineInstr *NewMI) {
    if (!NewMI->isBundled())
      return;

    // Find start of bundle.
    auto I = NewMI->getIterator();
    while (I->isBundledWithPred())
      I--;
    if (I->isBundle())
      I++;

    // Bail if this is not an S_GETPC bundle.
    if (I->getOpcode() != AMDGPU::S_GETPC_B64)
      return;

    // Update offsets of any references in the bundle.
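    // The inserted S_WAITCNT_DEPCTR adds NewBytes (4) of code, so each
    // global reference later in the bundle must be rebased by that amount
    // to remain correct relative to the S_GETPC_B64 result.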
    const unsigned NewBytes = 4;
    assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           "Unexpected instruction insertion in bundle");
    auto NextMI = std::next(NewMI->getIterator());
    auto End = NewMI->getParent()->end();
    while (NextMI != End && NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + NewBytes);
      }
      NextMI++;
    }
  }

  struct HazardState {
    static constexpr unsigned None = 0;
    static constexpr unsigned SALU = (1 << 0);
    static constexpr unsigned VALU = (1 << 1);

    std::bitset<64> Tracked;      // SGPR banks ever read by VALU
    std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
    std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
    unsigned VCCHazard = None;    // Source of current VCC writes
    bool ActiveFlat = false;      // Has unwaited flat instructions

    bool merge(const HazardState &RHS) {
      HazardState Orig(*this);
      *this |= RHS;
      return (*this != Orig);
    }

    bool operator==(const HazardState &RHS) const {
      return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
             VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
             ActiveFlat == RHS.ActiveFlat;
    }

    bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }

    void operator|=(const HazardState &RHS) {
      Tracked |= RHS.Tracked;
      SALUHazards |= RHS.SALUHazards;
      VALUHazards |= RHS.VALUHazards;
      VCCHazard |= RHS.VCCHazard;
      ActiveFlat |= RHS.ActiveFlat;
    }
  };

  struct BlockHazardState {
    HazardState In;
    HazardState Out;
  };

  DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;

  static constexpr unsigned WAVE32_NOPS = 4;
  static constexpr unsigned WAVE64_NOPS = 8;

  void insertHazardCull(MachineBasicBlock &MBB,
                        MachineBasicBlock::instr_iterator &MI) {
    assert(!MI->isBundled());
    unsigned Count = DsNopCount;
    while (Count--)
      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
  }

  unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
    unsigned Mask = 0xffff;
    Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
                       AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
    return Mask;
  }

  bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
                                unsigned Mask) {
    auto MBB = MI->getParent();
    if (MI == MBB->instr_begin())
      return false;

    auto It = prev_nodbg(MI, MBB->instr_begin());
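    // If the preceding non-debug instruction is already an S_WAITCNT_DEPCTR,
    // fold the new mask into it instead of emitting a second wait; mergeMasks
    // keeps the minimum of each depctr field, since the lower count is the
    // stricter wait.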
    if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
      return false;

    It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
    return true;
  }

  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };

    HazardState State = BlockState[&MBB].In;
    SmallSet<Register, 8> SeenRegs;
    bool Emitted = false;
    unsigned DsNops = 0;

    for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
                                           E = MBB.instr_end();
         MI != E; ++MI) {
      if (MI->isMetaInstruction())
        continue;

      // Clear tracked SGPRs if sufficient DS_NOPs occur
      if (MI->getOpcode() == AMDGPU::DS_NOP) {
        if (++DsNops >= DsNopCount)
          State.Tracked.reset();
        continue;
      }
      DsNops = 0;

      // Snoop FLAT instructions to avoid adding culls before scratch/lds
      // loads. Culls could be disproportionate in cost to load time.
      if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
        State.ActiveFlat = true;

      // SMEM or VMEM clears hazards
      // FIXME: adapt to add FLAT without VALU (so !isLDSDMA())?
      if ((SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI)) ||
          SIInstrInfo::isSMRD(*MI)) {
        State.VCCHazard = HazardState::None;
        State.SALUHazards.reset();
        State.VALUHazards.reset();
        continue;
      }

      // An existing s_wait_alu (S_WAITCNT_DEPCTR) can clear hazards
      if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned int Mask = MI->getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
          State.VCCHazard &= ~HazardState::VALU;
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
        }
        if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
          State.VALUHazards.reset();
        continue;
      }

      // Snoop counter waits to insert culls
      if (CullSGPRHazardsAtMemWait &&
          (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
          (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
          (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
        if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
          State.ActiveFlat = false;
        } else {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
          continue;
        }
      }

      // Process only VALUs and SALUs
      bool IsVALU = SIInstrInfo::isVALU(*MI);
      bool IsSALU = SIInstrInfo::isSALU(*MI);
      if (!IsVALU && !IsSALU)
        continue;

      unsigned Wait = 0;

      auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
        if (!Op.isReg())
          return;
        Register Reg = Op.getReg();
        assert(!Op.getSubReg());
        if (!TRI->isSGPRReg(*MRI, Reg))
          return;

        // Only visit each register once
        if (!SeenRegs.insert(Reg).second)
          return;

        auto RegNumber = sgprNumber(Reg, *TRI);
        if (!RegNumber)
          return;

        // Track SGPRs by pair -- the numeric ID of a 64-bit SGPR pair,
        // e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
        unsigned RegN = *RegNumber;
        unsigned PairN = (RegN >> 1) & 0x3f;

        // Read/write of an untracked register is safe, but any new reads
        // must be recorded.
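        // For example, a VALU read of SGPR5 sets Tracked bit 2 (the
        // SGPR4_SGPR5 pair); only registers in tracked pairs can
        // subsequently require a wait.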
        if (!State.Tracked[PairN]) {
          if (IsVALU && IsUse)
            State.Tracked.set(PairN);
          return;
        }

        uint8_t SGPRCount =
            AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;

        if (IsUse) {
          // SALU reading an SGPR clears VALU hazards
          if (IsSALU) {
            if (isVCC(Reg)) {
              if (State.VCCHazard & HazardState::VALU)
                State.VCCHazard = HazardState::None;
            } else {
              State.VALUHazards.reset();
            }
          }
          // Compute required waits
          for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
            Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
            Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
          }
          if (isVCC(Reg) && State.VCCHazard) {
            // Note: it's possible for both SALU and VALU to exist if VCC
            // was updated differently by merged predecessors.
            if (State.VCCHazard & HazardState::SALU)
              Wait |= WA_SALU;
            if (State.VCCHazard & HazardState::VALU)
              Wait |= WA_VCC;
          }
        } else {
          // Update hazards
          if (isVCC(Reg)) {
            State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
          } else {
            for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
              if (IsSALU)
                State.SALUHazards.set(RegN + RegIdx);
              else
                State.VALUHazards.set(RegN + RegIdx);
            }
          }
        }
      };

      const bool IsSetPC =
          (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
          MI->getOpcode() != AMDGPU::S_ENDPGM &&
          MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;

      // Only consider implicit VCC specified by instruction descriptor.
      const bool HasImplicitVCC =
          llvm::any_of(MI->getDesc().implicit_uses(), isVCC) ||
          llvm::any_of(MI->getDesc().implicit_defs(), isVCC);

      if (IsSetPC) {
        // All SGPR writes before a call/return must be flushed as the
        // callee/caller will not see the hazard chain.
        if (State.VCCHazard & HazardState::VALU)
          Wait |= WA_VCC;
        if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
          Wait |= WA_SALU;
        if (State.VALUHazards.any())
          Wait |= WA_VALU;
        if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
        }
      } else {
        // Process uses to determine required wait.
        SeenRegs.clear();
        for (const MachineOperand &Op : MI->all_uses()) {
          if (Op.isImplicit() &&
              (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
            continue;
          processOperand(Op, true);
        }
      }

      // Apply wait
      if (Wait) {
        unsigned Mask = 0xffff;
        if (Wait & WA_VCC) {
          State.VCCHazard &= ~HazardState::VALU;
          Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
        }
        if (Wait & WA_SALU) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
          Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
        }
        if (Wait & WA_VALU) {
          State.VALUHazards.reset();
          Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
        }
        if (Emit) {
          if (!mergeConsecutiveWaitAlus(MI, Mask)) {
            auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
                                 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
                             .addImm(Mask);
            updateGetPCBundle(NewMI);
          }
          Emitted = true;
        }
      }

      // On return from a call the SGPR state is unknown, so treat all SGPRs
      // as potential hazards.
      if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
        State.Tracked.set();

      // Update hazards based on defs.
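      // A def from a SALU sets SALUHazards bits and a def from a VALU sets
      // VALUHazards bits; a def of VCC instead records its producer in
      // VCCHazard (see the write half of processOperand above).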
      SeenRegs.clear();
      for (const MachineOperand &Op : MI->all_defs()) {
        if (Op.isImplicit() &&
            (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
          continue;
        processOperand(Op, false);
      }
    }

    BlockHazardState &BS = BlockState[&MBB];
    bool Changed = State != BS.Out;
    if (Emit) {
      assert(!Changed && "Hazard state should not change on emit pass");
      return Emitted;
    }
    if (Changed)
      BS.Out = State;
    return Changed;
  }

  bool run(MachineFunction &MF) {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasVALUReadSGPRHazard())
      return false;

    // Parse settings
    EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
    CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
    CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
    CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;

    if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
      EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
          "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
    if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
      CullSGPRHazardsOnFunctionBoundary =
          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
    if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
      CullSGPRHazardsAtMemWait =
          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
    if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
      CullSGPRHazardsMemWaitThreshold =
          MF.getFunction().getFnAttributeAsParsedInteger(
              "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
              CullSGPRHazardsMemWaitThreshold);

    // Bail if disabled
    if (!EnableSGPRHazardWaits)
      return false;

    TII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();
    MRI = &MF.getRegInfo();
    DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;

    auto CallingConv = MF.getFunction().getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
        !CullSGPRHazardsOnFunctionBoundary) {
      // Callee must consider all SGPRs as tracked.
      LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
      MachineBasicBlock &EntryBlock = MF.front();
      BlockState[&EntryBlock].In.Tracked.set();
    }

    // Calculate the hazard state for each basic block,
    // iterating until a fixed point is reached.
    // The fixed point is guaranteed because the merge function only ever
    // increases the hazard set, and all backedges will cause a merge.
    //
    // Note: we have to take care of the entry block as this technically
    // has an edge from outside the function. Failure to treat this as
    // a merge could prevent the fixed point from being reached.
    SetVector<MachineBasicBlock *> Worklist;
    for (auto &MBB : reverse(MF))
      Worklist.insert(&MBB);
    while (!Worklist.empty()) {
      auto &MBB = *Worklist.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed) {
        // Note: take a copy of the state here in case it is reallocated by
        // the map.
        HazardState NewState = BlockState[&MBB].Out;
        // Propagate to all successor blocks.
        for (auto Succ : MBB.successors()) {
          // We only need to merge hazards at CFG merge points.
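          // A successor with a single predecessor can take the out-state
          // directly; the entry block is excluded since its implicit edge
          // from outside the function makes it a merge point (see the note
          // above the worklist setup).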
          auto &SuccState = BlockState[Succ];
          if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
            if (SuccState.In != NewState) {
              SuccState.In = NewState;
              Worklist.insert(Succ);
            }
          } else if (SuccState.In.merge(NewState)) {
            Worklist.insert(Succ);
          }
        }
      }
    }

    LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");

    // Final pass to emit the wait instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);

    BlockState.clear();
    return Changed;
  }
};

class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    return AMDGPUWaitSGPRHazards().run(MF);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // namespace

char AMDGPUWaitSGPRHazardsLegacy::ID = 0;

char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;

INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
                "AMDGPU Insert waits for SGPR read hazards", false, false)

PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  if (AMDGPUWaitSGPRHazards().run(MF))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}