1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Insert wait instructions for memory reads and writes. 11 /// 12 /// Memory reads and writes are issued asynchronously, so we need to insert 13 /// S_WAITCNT instructions when we want to access any of their results or 14 /// overwrite any register that's used asynchronously. 15 /// 16 /// TODO: This pass currently keeps one timeline per hardware counter. A more 17 /// finely-grained approach that keeps one timeline per event type could 18 /// sometimes get away with generating weaker s_waitcnt instructions. For 19 /// example, when both SMEM and LDS are in flight and we need to wait for 20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, 21 /// but the pass will currently generate a conservative lgkmcnt(0) because 22 /// multiple event types are in flight. 23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "AMDGPU.h" 27 #include "GCNSubtarget.h" 28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 29 #include "SIMachineFunctionInfo.h" 30 #include "Utils/AMDGPUBaseInfo.h" 31 #include "llvm/ADT/MapVector.h" 32 #include "llvm/ADT/PostOrderIterator.h" 33 #include "llvm/ADT/Sequence.h" 34 #include "llvm/CodeGen/MachineLoopInfo.h" 35 #include "llvm/CodeGen/MachinePostDominators.h" 36 #include "llvm/InitializePasses.h" 37 #include "llvm/Support/DebugCounter.h" 38 #include "llvm/TargetParser/TargetParser.h" 39 using namespace llvm; 40 41 #define DEBUG_TYPE "si-insert-waitcnts" 42 43 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp", 44 "Force emit s_waitcnt expcnt(0) instrs"); 45 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", 46 "Force emit s_waitcnt lgkmcnt(0) instrs"); 47 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", 48 "Force emit s_waitcnt vmcnt(0) instrs"); 49 50 static cl::opt<bool> ForceEmitZeroFlag( 51 "amdgpu-waitcnt-forcezero", 52 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), 53 cl::init(false), cl::Hidden); 54 55 namespace { 56 // Class of object that encapsulates latest instruction counter score 57 // associated with the operand. Used for determining whether 58 // s_waitcnt instruction needs to be emitted. 
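// Illustrative example (not taken from this file): a VMEM load must be
// waited on before its result is consumed, e.g.
//   global_load_dword v0, v[1:2], off
//   s_waitcnt vmcnt(0)       ; inserted by this pass
//   v_add_u32 v3, v0, v4     ; first use of the loaded value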
59 60 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; 61 } // namespace 62 63 namespace llvm { 64 template <> struct enum_iteration_traits<InstCounterType> { 65 static constexpr bool is_iterable = true; 66 }; 67 } // namespace llvm 68 69 namespace { 70 auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } 71 72 using RegInterval = std::pair<int, int>; 73 74 struct HardwareLimits { 75 unsigned VmcntMax; 76 unsigned ExpcntMax; 77 unsigned LgkmcntMax; 78 unsigned VscntMax; 79 }; 80 81 struct RegisterEncoding { 82 unsigned VGPR0; 83 unsigned VGPRL; 84 unsigned SGPR0; 85 unsigned SGPRL; 86 }; 87 88 enum WaitEventType { 89 VMEM_ACCESS, // vector-memory read & write 90 VMEM_READ_ACCESS, // vector-memory read 91 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch 92 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch 93 LDS_ACCESS, // lds read & write 94 GDS_ACCESS, // gds read & write 95 SQ_MESSAGE, // send message 96 SMEM_ACCESS, // scalar-memory read & write 97 EXP_GPR_LOCK, // export holding on its data src 98 GDS_GPR_LOCK, // GDS holding on its data and addr src 99 EXP_POS_ACCESS, // write to export position 100 EXP_PARAM_ACCESS, // write to export parameter 101 VMW_GPR_LOCK, // vector-memory write holding on its data src 102 EXP_LDS_ACCESS, // read by ldsdir counting as export 103 NUM_WAIT_EVENTS, 104 }; 105 106 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { 107 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), 108 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | 109 (1 << SQ_MESSAGE), 110 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | 111 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS), 112 (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)}; 113 114 // The mapping is: 115 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs 116 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots 117 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs 118 // We reserve a fixed number of VGPR slots in the scoring tables for 119 // special tokens like SCMEM_LDS (needed for buffer load to LDS). 120 enum RegisterMapping { 121 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. 122 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. 123 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. 124 NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. 125 EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes. 126 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. 127 }; 128 129 // Enumerate different types of result-returning VMEM operations. Although 130 // s_waitcnt orders them all with a single vmcnt counter, in the absence of 131 // s_waitcnt only instructions of the same VmemType are guaranteed to write 132 // their results in order -- so there is no need to insert an s_waitcnt between 133 // two instructions of the same type that write the same vgpr. 134 enum VmemType { 135 // BUF instructions and MIMG instructions without a sampler. 136 VMEM_NOSAMPLER, 137 // MIMG instructions with a sampler. 
138 VMEM_SAMPLER, 139 // BVH instructions 140 VMEM_BVH 141 }; 142 143 static bool updateVMCntOnly(const MachineInstr &Inst) { 144 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) || 145 SIInstrInfo::isFLATScratch(Inst); 146 } 147 148 VmemType getVmemType(const MachineInstr &Inst) { 149 assert(updateVMCntOnly(Inst)); 150 if (!SIInstrInfo::isMIMG(Inst)) 151 return VMEM_NOSAMPLER; 152 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); 153 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = 154 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 155 return BaseInfo->BVH ? VMEM_BVH 156 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER; 157 } 158 159 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { 160 switch (T) { 161 case VM_CNT: 162 Wait.VmCnt = std::min(Wait.VmCnt, Count); 163 break; 164 case EXP_CNT: 165 Wait.ExpCnt = std::min(Wait.ExpCnt, Count); 166 break; 167 case LGKM_CNT: 168 Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); 169 break; 170 case VS_CNT: 171 Wait.VsCnt = std::min(Wait.VsCnt, Count); 172 break; 173 default: 174 llvm_unreachable("bad InstCounterType"); 175 } 176 } 177 178 // This objects maintains the current score brackets of each wait counter, and 179 // a per-register scoreboard for each wait counter. 180 // 181 // We also maintain the latest score for every event type that can change the 182 // waitcnt in order to know if there are multiple types of events within 183 // the brackets. When multiple types of event happen in the bracket, 184 // wait count may get decreased out of order, therefore we need to put in 185 // "s_waitcnt 0" before use. 186 class WaitcntBrackets { 187 public: 188 WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits, 189 RegisterEncoding Encoding) 190 : ST(SubTarget), Limits(Limits), Encoding(Encoding) {} 191 192 unsigned getWaitCountMax(InstCounterType T) const { 193 switch (T) { 194 case VM_CNT: 195 return Limits.VmcntMax; 196 case LGKM_CNT: 197 return Limits.LgkmcntMax; 198 case EXP_CNT: 199 return Limits.ExpcntMax; 200 case VS_CNT: 201 return Limits.VscntMax; 202 default: 203 break; 204 } 205 return 0; 206 } 207 208 unsigned getScoreLB(InstCounterType T) const { 209 assert(T < NUM_INST_CNTS); 210 return ScoreLBs[T]; 211 } 212 213 unsigned getScoreUB(InstCounterType T) const { 214 assert(T < NUM_INST_CNTS); 215 return ScoreUBs[T]; 216 } 217 218 unsigned getScoreRange(InstCounterType T) const { 219 return getScoreUB(T) - getScoreLB(T); 220 } 221 222 // Mapping from event to counter. 
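// (This is simply the inverse of WaitEventMaskForInst above: an event belongs
// to the unique counter whose mask contains its bit.)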
223 InstCounterType eventCounter(WaitEventType E) const { 224 for (auto T : inst_counter_types()) { 225 if (WaitEventMaskForInst[T] & (1 << E)) 226 return T; 227 } 228 llvm_unreachable("event type has no associated counter"); 229 } 230 231 unsigned getRegScore(int GprNo, InstCounterType T) const { 232 if (GprNo < NUM_ALL_VGPRS) { 233 return VgprScores[T][GprNo]; 234 } 235 assert(T == LGKM_CNT); 236 return SgprScores[GprNo - NUM_ALL_VGPRS]; 237 } 238 239 bool merge(const WaitcntBrackets &Other); 240 241 RegInterval getRegInterval(const MachineInstr *MI, 242 const MachineRegisterInfo *MRI, 243 const SIRegisterInfo *TRI, unsigned OpNo) const; 244 245 bool counterOutOfOrder(InstCounterType T) const; 246 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; 247 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; 248 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; 249 void applyWaitcnt(const AMDGPU::Waitcnt &Wait); 250 void applyWaitcnt(InstCounterType T, unsigned Count); 251 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, 252 const MachineRegisterInfo *MRI, WaitEventType E, 253 MachineInstr &MI); 254 255 unsigned hasPendingEvent() const { return PendingEvents; } 256 unsigned hasPendingEvent(WaitEventType E) const { 257 return PendingEvents & (1 << E); 258 } 259 unsigned hasPendingEvent(InstCounterType T) const { 260 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; 261 assert((HasPending != 0) == (getScoreRange(T) != 0)); 262 return HasPending; 263 } 264 265 bool hasMixedPendingEvents(InstCounterType T) const { 266 unsigned Events = hasPendingEvent(T); 267 // Return true if more than one bit is set in Events. 268 return Events & (Events - 1); 269 } 270 271 bool hasPendingFlat() const { 272 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && 273 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || 274 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && 275 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); 276 } 277 278 void setPendingFlat() { 279 LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; 280 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; 281 } 282 283 // Return true if there might be pending writes to the specified vgpr by VMEM 284 // instructions with types different from V. 
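// For example, a pending MIMG-with-sampler load of a vgpr followed by a def
// of the same vgpr from a BUF load still needs a vmcnt wait, because results
// of different VmemTypes are not guaranteed to land in issue order.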
285 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { 286 assert(GprNo < NUM_ALL_VGPRS); 287 return VgprVmemTypes[GprNo] & ~(1 << V); 288 } 289 290 void clearVgprVmemTypes(int GprNo) { 291 assert(GprNo < NUM_ALL_VGPRS); 292 VgprVmemTypes[GprNo] = 0; 293 } 294 295 void setNonKernelFunctionInitialState() { 296 setScoreUB(VS_CNT, getWaitCountMax(VS_CNT)); 297 PendingEvents |= WaitEventMaskForInst[VS_CNT]; 298 } 299 300 void print(raw_ostream &); 301 void dump() { print(dbgs()); } 302 303 private: 304 struct MergeInfo { 305 unsigned OldLB; 306 unsigned OtherLB; 307 unsigned MyShift; 308 unsigned OtherShift; 309 }; 310 static bool mergeScore(const MergeInfo &M, unsigned &Score, 311 unsigned OtherScore); 312 313 void setScoreLB(InstCounterType T, unsigned Val) { 314 assert(T < NUM_INST_CNTS); 315 ScoreLBs[T] = Val; 316 } 317 318 void setScoreUB(InstCounterType T, unsigned Val) { 319 assert(T < NUM_INST_CNTS); 320 ScoreUBs[T] = Val; 321 322 if (T != EXP_CNT) 323 return; 324 325 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) 326 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); 327 } 328 329 void setRegScore(int GprNo, InstCounterType T, unsigned Val) { 330 if (GprNo < NUM_ALL_VGPRS) { 331 VgprUB = std::max(VgprUB, GprNo); 332 VgprScores[T][GprNo] = Val; 333 } else { 334 assert(T == LGKM_CNT); 335 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); 336 SgprScores[GprNo - NUM_ALL_VGPRS] = Val; 337 } 338 } 339 340 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, 341 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, 342 unsigned OpNo, unsigned Val); 343 344 const GCNSubtarget *ST = nullptr; 345 HardwareLimits Limits = {}; 346 RegisterEncoding Encoding = {}; 347 unsigned ScoreLBs[NUM_INST_CNTS] = {0}; 348 unsigned ScoreUBs[NUM_INST_CNTS] = {0}; 349 unsigned PendingEvents = 0; 350 // Remember the last flat memory operation. 351 unsigned LastFlat[NUM_INST_CNTS] = {0}; 352 // wait_cnt scores for every vgpr. 353 // Keep track of the VgprUB and SgprUB to make merge at join efficient. 354 int VgprUB = -1; 355 int SgprUB = -1; 356 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; 357 // Wait cnt scores for every sgpr, only lgkmcnt is relevant. 358 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; 359 // Bitmask of the VmemTypes of VMEM instructions that might have a pending 360 // write to each vgpr. 361 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; 362 }; 363 364 class SIInsertWaitcnts : public MachineFunctionPass { 365 private: 366 const GCNSubtarget *ST = nullptr; 367 const SIInstrInfo *TII = nullptr; 368 const SIRegisterInfo *TRI = nullptr; 369 const MachineRegisterInfo *MRI = nullptr; 370 AMDGPU::IsaVersion IV; 371 372 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; 373 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; 374 MachineLoopInfo *MLI; 375 MachinePostDominatorTree *PDT; 376 377 struct BlockInfo { 378 std::unique_ptr<WaitcntBrackets> Incoming; 379 bool Dirty = true; 380 }; 381 382 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; 383 384 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 385 // because of amdgpu-waitcnt-forcezero flag 386 bool ForceEmitZeroWaitcnts; 387 bool ForceEmitWaitcnt[NUM_INST_CNTS]; 388 389 bool OptNone; 390 391 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS 392 // message. 
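// (Where supported, e.g. on GFX11+, the release is sent as an explicit
// message rather than expressed as a wait.)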
393 DenseSet<MachineInstr *> ReleaseVGPRInsts; 394 395 public: 396 static char ID; 397 398 SIInsertWaitcnts() : MachineFunctionPass(ID) { 399 (void)ForceExpCounter; 400 (void)ForceLgkmCounter; 401 (void)ForceVMCounter; 402 } 403 404 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); 405 bool isPreheaderToFlush(MachineBasicBlock &MBB, 406 WaitcntBrackets &ScoreBrackets); 407 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; 408 bool runOnMachineFunction(MachineFunction &MF) override; 409 410 StringRef getPassName() const override { 411 return "SI insert wait instructions"; 412 } 413 414 void getAnalysisUsage(AnalysisUsage &AU) const override { 415 AU.setPreservesCFG(); 416 AU.addRequired<MachineLoopInfo>(); 417 AU.addRequired<MachinePostDominatorTree>(); 418 MachineFunctionPass::getAnalysisUsage(AU); 419 } 420 421 bool isForceEmitWaitcnt() const { 422 for (auto T : inst_counter_types()) 423 if (ForceEmitWaitcnt[T]) 424 return true; 425 return false; 426 } 427 428 void setForceEmitWaitcnt() { 429 // For non-debug builds, ForceEmitWaitcnt has been initialized to false; 430 // For debug builds, get the debug counter info and adjust if need be 431 #ifndef NDEBUG 432 if (DebugCounter::isCounterSet(ForceExpCounter) && 433 DebugCounter::shouldExecute(ForceExpCounter)) { 434 ForceEmitWaitcnt[EXP_CNT] = true; 435 } else { 436 ForceEmitWaitcnt[EXP_CNT] = false; 437 } 438 439 if (DebugCounter::isCounterSet(ForceLgkmCounter) && 440 DebugCounter::shouldExecute(ForceLgkmCounter)) { 441 ForceEmitWaitcnt[LGKM_CNT] = true; 442 } else { 443 ForceEmitWaitcnt[LGKM_CNT] = false; 444 } 445 446 if (DebugCounter::isCounterSet(ForceVMCounter) && 447 DebugCounter::shouldExecute(ForceVMCounter)) { 448 ForceEmitWaitcnt[VM_CNT] = true; 449 } else { 450 ForceEmitWaitcnt[VM_CNT] = false; 451 } 452 #endif // NDEBUG 453 } 454 455 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or 456 // FLAT instruction. 457 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { 458 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); 459 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side 460 // these should use VM_CNT. 461 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) 462 return VMEM_ACCESS; 463 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) { 464 // FLAT and SCRATCH instructions may access scratch. Other VMEM 465 // instructions do not. 
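// Scratch writes get a distinct event so that the rest of the pass (e.g. the
// S_ENDPGM handling) can tell whether a pending vscnt might still be an
// outstanding scratch store.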
466 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) 467 return SCRATCH_WRITE_ACCESS; 468 return VMEM_WRITE_ACCESS; 469 } 470 return VMEM_READ_ACCESS; 471 } 472 473 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; 474 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; 475 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; 476 bool generateWaitcntInstBefore(MachineInstr &MI, 477 WaitcntBrackets &ScoreBrackets, 478 MachineInstr *OldWaitcntInstr, 479 bool FlushVmCnt); 480 bool generateWaitcntBlockEnd(MachineBasicBlock &Block, 481 WaitcntBrackets &ScoreBrackets, 482 MachineInstr *OldWaitcntInstr); 483 bool generateWaitcnt(AMDGPU::Waitcnt Wait, 484 MachineBasicBlock::instr_iterator It, 485 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, 486 MachineInstr *OldWaitcntInstr); 487 void updateEventWaitcntAfter(MachineInstr &Inst, 488 WaitcntBrackets *ScoreBrackets); 489 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, 490 WaitcntBrackets &ScoreBrackets); 491 bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, 492 MachineInstr &OldWaitcntInstr, 493 AMDGPU::Waitcnt &Wait, 494 MachineBasicBlock::instr_iterator It) const; 495 496 // Transform a soft waitcnt into a normal one. 497 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; 498 }; 499 500 } // end anonymous namespace 501 502 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, 503 const MachineRegisterInfo *MRI, 504 const SIRegisterInfo *TRI, 505 unsigned OpNo) const { 506 const MachineOperand &Op = MI->getOperand(OpNo); 507 if (!TRI->isInAllocatableClass(Op.getReg())) 508 return {-1, -1}; 509 510 // A use via a PW operand does not need a waitcnt. 511 // A partial write is not a WAW. 512 assert(!Op.getSubReg() || !Op.isUndef()); 513 514 RegInterval Result; 515 516 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & 517 AMDGPU::HWEncoding::REG_IDX_MASK; 518 519 if (TRI->isVectorRegister(*MRI, Op.getReg())) { 520 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); 521 Result.first = Reg - Encoding.VGPR0; 522 if (TRI->isAGPR(*MRI, Op.getReg())) 523 Result.first += AGPR_OFFSET; 524 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); 525 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { 526 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); 527 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; 528 assert(Result.first >= NUM_ALL_VGPRS && 529 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); 530 } 531 // TODO: Handle TTMP 532 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... 
533 else 534 return {-1, -1}; 535 536 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); 537 unsigned Size = TRI->getRegSizeInBits(*RC); 538 Result.second = Result.first + ((Size + 16) / 32); 539 540 return Result; 541 } 542 543 void WaitcntBrackets::setExpScore(const MachineInstr *MI, 544 const SIInstrInfo *TII, 545 const SIRegisterInfo *TRI, 546 const MachineRegisterInfo *MRI, unsigned OpNo, 547 unsigned Val) { 548 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo); 549 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); 550 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 551 setRegScore(RegNo, EXP_CNT, Val); 552 } 553 } 554 555 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, 556 const SIRegisterInfo *TRI, 557 const MachineRegisterInfo *MRI, 558 WaitEventType E, MachineInstr &Inst) { 559 InstCounterType T = eventCounter(E); 560 unsigned CurrScore = getScoreUB(T) + 1; 561 if (CurrScore == 0) 562 report_fatal_error("InsertWaitcnt score wraparound"); 563 // PendingEvents and ScoreUB need to be update regardless if this event 564 // changes the score of a register or not. 565 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. 566 PendingEvents |= 1 << E; 567 setScoreUB(T, CurrScore); 568 569 if (T == EXP_CNT) { 570 // Put score on the source vgprs. If this is a store, just use those 571 // specific register(s). 572 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { 573 int AddrOpIdx = 574 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr); 575 // All GDS operations must protect their address register (same as 576 // export.) 577 if (AddrOpIdx != -1) { 578 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); 579 } 580 581 if (Inst.mayStore()) { 582 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) { 583 setExpScore( 584 &Inst, TII, TRI, MRI, 585 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), 586 CurrScore); 587 } 588 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) { 589 setExpScore(&Inst, TII, TRI, MRI, 590 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 591 AMDGPU::OpName::data1), 592 CurrScore); 593 } 594 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && 595 Inst.getOpcode() != AMDGPU::DS_APPEND && 596 Inst.getOpcode() != AMDGPU::DS_CONSUME && 597 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { 598 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 599 const MachineOperand &Op = Inst.getOperand(I); 600 if (Op.isReg() && !Op.isDef() && 601 TRI->isVectorRegister(*MRI, Op.getReg())) { 602 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 603 } 604 } 605 } 606 } else if (TII->isFLAT(Inst)) { 607 if (Inst.mayStore()) { 608 setExpScore( 609 &Inst, TII, TRI, MRI, 610 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 611 CurrScore); 612 } else if (SIInstrInfo::isAtomicRet(Inst)) { 613 setExpScore( 614 &Inst, TII, TRI, MRI, 615 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 616 CurrScore); 617 } 618 } else if (TII->isMIMG(Inst)) { 619 if (Inst.mayStore()) { 620 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 621 } else if (SIInstrInfo::isAtomicRet(Inst)) { 622 setExpScore( 623 &Inst, TII, TRI, MRI, 624 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 625 CurrScore); 626 } 627 } else if (TII->isMTBUF(Inst)) { 628 if (Inst.mayStore()) { 629 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 630 } 631 } else if 
(TII->isMUBUF(Inst)) { 632 if (Inst.mayStore()) { 633 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 634 } else if (SIInstrInfo::isAtomicRet(Inst)) { 635 setExpScore( 636 &Inst, TII, TRI, MRI, 637 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 638 CurrScore); 639 } 640 } else if (TII->isLDSDIR(Inst)) { 641 // LDSDIR instructions attach the score to the destination. 642 setExpScore( 643 &Inst, TII, TRI, MRI, 644 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst), 645 CurrScore); 646 } else { 647 if (TII->isEXP(Inst)) { 648 // For export the destination registers are really temps that 649 // can be used as the actual source after export patching, so 650 // we need to treat them like sources and set the EXP_CNT 651 // score. 652 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 653 MachineOperand &DefMO = Inst.getOperand(I); 654 if (DefMO.isReg() && DefMO.isDef() && 655 TRI->isVGPR(*MRI, DefMO.getReg())) { 656 setRegScore( 657 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)), 658 EXP_CNT, CurrScore); 659 } 660 } 661 } 662 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 663 MachineOperand &MO = Inst.getOperand(I); 664 if (MO.isReg() && !MO.isDef() && 665 TRI->isVectorRegister(*MRI, MO.getReg())) { 666 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 667 } 668 } 669 } 670 #if 0 // TODO: check if this is handled by MUBUF code above. 671 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD || 672 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 || 673 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { 674 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); 675 unsigned OpNo;//TODO: find the OpNo for this operand; 676 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo); 677 for (int RegNo = Interval.first; RegNo < Interval.second; 678 ++RegNo) { 679 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); 680 } 681 #endif 682 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { 683 // Match the score to the destination registers. 684 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 685 auto &Op = Inst.getOperand(I); 686 if (!Op.isReg() || !Op.isDef()) 687 continue; 688 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I); 689 if (T == VM_CNT) { 690 if (Interval.first >= NUM_ALL_VGPRS) 691 continue; 692 if (updateVMCntOnly(Inst)) { 693 // updateVMCntOnly should only leave us with VGPRs 694 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR 695 // defs. That's required for a sane index into `VgprMemTypes` below 696 assert(TRI->isVectorRegister(*MRI, Op.getReg())); 697 VmemType V = getVmemType(Inst); 698 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) 699 VgprVmemTypes[RegNo] |= 1 << V; 700 } 701 } 702 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 703 setRegScore(RegNo, T, CurrScore); 704 } 705 } 706 if (Inst.mayStore() && 707 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { 708 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS 709 // written can be accessed. A load from LDS to VMEM does not need a wait. 
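// The score is recorded on the artificial EXTRA_VGPR_LDS slot (see
// RegisterMapping above) rather than on any real vgpr.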
710 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); 711 } 712 } 713 } 714 715 void WaitcntBrackets::print(raw_ostream &OS) { 716 OS << '\n'; 717 for (auto T : inst_counter_types()) { 718 unsigned SR = getScoreRange(T); 719 720 switch (T) { 721 case VM_CNT: 722 OS << " VM_CNT(" << SR << "): "; 723 break; 724 case LGKM_CNT: 725 OS << " LGKM_CNT(" << SR << "): "; 726 break; 727 case EXP_CNT: 728 OS << " EXP_CNT(" << SR << "): "; 729 break; 730 case VS_CNT: 731 OS << " VS_CNT(" << SR << "): "; 732 break; 733 default: 734 OS << " UNKNOWN(" << SR << "): "; 735 break; 736 } 737 738 if (SR != 0) { 739 // Print vgpr scores. 740 unsigned LB = getScoreLB(T); 741 742 for (int J = 0; J <= VgprUB; J++) { 743 unsigned RegScore = getRegScore(J, T); 744 if (RegScore <= LB) 745 continue; 746 unsigned RelScore = RegScore - LB - 1; 747 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { 748 OS << RelScore << ":v" << J << " "; 749 } else { 750 OS << RelScore << ":ds "; 751 } 752 } 753 // Also need to print sgpr scores for lgkm_cnt. 754 if (T == LGKM_CNT) { 755 for (int J = 0; J <= SgprUB; J++) { 756 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); 757 if (RegScore <= LB) 758 continue; 759 unsigned RelScore = RegScore - LB - 1; 760 OS << RelScore << ":s" << J << " "; 761 } 762 } 763 } 764 OS << '\n'; 765 } 766 OS << '\n'; 767 } 768 769 /// Simplify the waitcnt, in the sense of removing redundant counts, and return 770 /// whether a waitcnt instruction is needed at all. 771 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { 772 simplifyWaitcnt(VM_CNT, Wait.VmCnt); 773 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); 774 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); 775 simplifyWaitcnt(VS_CNT, Wait.VsCnt); 776 } 777 778 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, 779 unsigned &Count) const { 780 // The number of outstanding events for this type, T, can be calculated 781 // as (UB - LB). If the current Count is greater than or equal to the number 782 // of outstanding events, then the wait for this counter is redundant. 783 if (Count >= getScoreRange(T)) 784 Count = ~0u; 785 } 786 787 void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, 788 AMDGPU::Waitcnt &Wait) const { 789 unsigned ScoreToWait = getRegScore(RegNo, T); 790 791 // If the score of src_operand falls within the bracket, we need an 792 // s_waitcnt instruction. 793 const unsigned LB = getScoreLB(T); 794 const unsigned UB = getScoreUB(T); 795 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { 796 if ((T == VM_CNT || T == LGKM_CNT) && 797 hasPendingFlat() && 798 !ST->hasFlatLgkmVMemCountInOrder()) { 799 // If there is a pending FLAT operation, and this is a VMem or LGKM 800 // waitcnt and the target can report early completion, then we need 801 // to force a waitcnt 0. 802 addWait(Wait, T, 0); 803 } else if (counterOutOfOrder(T)) { 804 // Counter can get decremented out-of-order when there 805 // are multiple types event in the bracket. Also emit an s_wait counter 806 // with a conservative value of 0 for the counter. 807 addWait(Wait, T, 0); 808 } else { 809 // If a counter has been maxed out avoid overflow by waiting for 810 // MAX(CounterType) - 1 instead. 
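// For example, with UB == 12 and ScoreToWait == 9, a wait of 3 lets the three
// younger events (scores 10..12) stay outstanding while guaranteeing that the
// producer with score 9 has completed.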
811 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); 812 addWait(Wait, T, NeededWait); 813 } 814 } 815 } 816 817 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { 818 applyWaitcnt(VM_CNT, Wait.VmCnt); 819 applyWaitcnt(EXP_CNT, Wait.ExpCnt); 820 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); 821 applyWaitcnt(VS_CNT, Wait.VsCnt); 822 } 823 824 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { 825 const unsigned UB = getScoreUB(T); 826 if (Count >= UB) 827 return; 828 if (Count != 0) { 829 if (counterOutOfOrder(T)) 830 return; 831 setScoreLB(T, std::max(getScoreLB(T), UB - Count)); 832 } else { 833 setScoreLB(T, UB); 834 PendingEvents &= ~WaitEventMaskForInst[T]; 835 } 836 } 837 838 // Where there are multiple types of event in the bracket of a counter, 839 // the decrement may go out of order. 840 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { 841 // Scalar memory read always can go out of order. 842 if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) 843 return true; 844 return hasMixedPendingEvents(T); 845 } 846 847 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 848 false) 849 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 850 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) 851 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 852 false) 853 854 char SIInsertWaitcnts::ID = 0; 855 856 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; 857 858 FunctionPass *llvm::createSIInsertWaitcntsPass() { 859 return new SIInsertWaitcnts(); 860 } 861 862 static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, 863 unsigned NewEnc) { 864 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); 865 assert(OpIdx >= 0); 866 867 MachineOperand &MO = MI.getOperand(OpIdx); 868 869 if (NewEnc == MO.getImm()) 870 return false; 871 872 MO.setImm(NewEnc); 873 return true; 874 } 875 876 bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { 877 unsigned Opcode = Waitcnt->getOpcode(); 878 if (!SIInstrInfo::isSoftWaitcnt(Opcode)) 879 return false; 880 881 Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode))); 882 return true; 883 } 884 885 /// Combine consecutive waitcnt instructions that precede \p It and follow 886 /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added 887 /// by previous passes. Currently this pass conservatively assumes that these 888 /// preexisting waitcnt are required for correctness. 889 bool SIInsertWaitcnts::applyPreexistingWaitcnt( 890 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, 891 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { 892 bool Modified = false; 893 MachineInstr *WaitcntInstr = nullptr; 894 MachineInstr *WaitcntVsCntInstr = nullptr; 895 896 for (auto &II : 897 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { 898 if (II.isMetaInstruction()) 899 continue; 900 901 unsigned Opcode = II.getOpcode(); 902 bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode); 903 904 if (SIInstrInfo::isWaitcnt(Opcode)) { 905 // Update required wait count. If this is a soft waitcnt (= it was added 906 // by an earlier pass), it may be entirely removed. 907 unsigned IEnc = II.getOperand(0).getImm(); 908 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); 909 if (IsSoft) 910 ScoreBrackets.simplifyWaitcnt(OldWait); 911 Wait = Wait.combined(OldWait); 912 913 // Merge consecutive waitcnt of the same type by erasing multiples. 
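// Keep the first s_waitcnt encountered (unless it is a soft one that has
// become redundant) and erase the rest; their requirements have already been
// folded into Wait above.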
914 if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) { 915 II.eraseFromParent(); 916 Modified = true; 917 } else 918 WaitcntInstr = &II; 919 920 } else { 921 assert(SIInstrInfo::isWaitcntVsCnt(Opcode)); 922 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 923 924 unsigned OldVSCnt = 925 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 926 if (IsSoft) 927 ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt); 928 Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); 929 930 if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) { 931 II.eraseFromParent(); 932 Modified = true; 933 } else 934 WaitcntVsCntInstr = &II; 935 } 936 } 937 938 // Updated encoding of merged waitcnt with the required wait. 939 if (WaitcntInstr) { 940 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, 941 AMDGPU::encodeWaitcnt(IV, Wait)); 942 Modified |= promoteSoftWaitCnt(WaitcntInstr); 943 944 ScoreBrackets.applyWaitcnt(Wait); 945 Wait.VmCnt = ~0u; 946 Wait.LgkmCnt = ~0u; 947 Wait.ExpCnt = ~0u; 948 949 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 950 ? dbgs() 951 << "applyPreexistingWaitcnt\n" 952 << "New Instr at block end: " << *WaitcntInstr << '\n' 953 : dbgs() << "applyPreexistingWaitcnt\n" 954 << "Old Instr: " << *It 955 << "New Instr: " << *WaitcntInstr << '\n'); 956 } 957 958 if (WaitcntVsCntInstr) { 959 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, 960 AMDGPU::OpName::simm16, Wait.VsCnt); 961 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr); 962 ScoreBrackets.applyWaitcnt(Wait); 963 Wait.VsCnt = ~0u; 964 965 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 966 ? dbgs() << "applyPreexistingWaitcnt\n" 967 << "New Instr at block end: " << *WaitcntVsCntInstr 968 << '\n' 969 : dbgs() << "applyPreexistingWaitcnt\n" 970 << "Old Instr: " << *It 971 << "New Instr: " << *WaitcntVsCntInstr << '\n'); 972 } 973 974 return Modified; 975 } 976 977 static bool readsVCCZ(const MachineInstr &MI) { 978 unsigned Opc = MI.getOpcode(); 979 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && 980 !MI.getOperand(1).isUndef(); 981 } 982 983 /// \returns true if the callee inserts an s_waitcnt 0 on function entry. 984 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { 985 // Currently all conventions wait, but this may not always be the case. 986 // 987 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make 988 // senses to omit the wait and do it in the caller. 989 return true; 990 } 991 992 /// \returns true if the callee is expected to wait for any outstanding waits 993 /// before returning. 994 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { 995 return true; 996 } 997 998 /// Generate s_waitcnt instruction to be placed before cur_Inst. 999 /// Instructions of a given type are returned in order, 1000 /// but instructions of different types can complete out of order. 1001 /// We rely on this in-order completion 1002 /// and simply assign a score to the memory access instructions. 1003 /// We keep track of the active "score bracket" to determine 1004 /// if an access of a memory read requires an s_waitcnt 1005 /// and if so what the value of each counter is. 1006 /// The "score bracket" is bound by the lower bound and upper bound 1007 /// scores (*_score_LB and *_score_ub respectively). 1008 /// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to 1009 /// flush the vmcnt counter here. 
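/// Roughly: compute the wait MI requires from the score brackets, simplify
/// away what is already known to have completed, apply any forced or flush
/// requests, and finally emit or update an s_waitcnt via generateWaitcnt().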
1010 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, 1011 WaitcntBrackets &ScoreBrackets, 1012 MachineInstr *OldWaitcntInstr, 1013 bool FlushVmCnt) { 1014 setForceEmitWaitcnt(); 1015 1016 if (MI.isMetaInstruction()) 1017 return false; 1018 1019 AMDGPU::Waitcnt Wait; 1020 1021 // FIXME: This should have already been handled by the memory legalizer. 1022 // Removing this currently doesn't affect any lit tests, but we need to 1023 // verify that nothing was relying on this. The number of buffer invalidates 1024 // being handled here should not be expanded. 1025 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || 1026 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || 1027 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || 1028 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || 1029 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { 1030 Wait.VmCnt = 0; 1031 } 1032 1033 // All waits must be resolved at call return. 1034 // NOTE: this could be improved with knowledge of all call sites or 1035 // with knowledge of the called routines. 1036 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || 1037 MI.getOpcode() == AMDGPU::SI_RETURN || 1038 MI.getOpcode() == AMDGPU::S_SETPC_B64_return || 1039 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { 1040 Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt()); 1041 } 1042 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM 1043 // stores. In this case it can be useful to send a message to explicitly 1044 // release all VGPRs before the stores have completed, but it is only safe to 1045 // do this if there are no outstanding scratch stores. 1046 else if (MI.getOpcode() == AMDGPU::S_ENDPGM || 1047 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { 1048 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone && 1049 ScoreBrackets.getScoreRange(VS_CNT) != 0 && 1050 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) 1051 ReleaseVGPRInsts.insert(&MI); 1052 } 1053 // Resolve vm waits before gs-done. 1054 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || 1055 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && 1056 ST->hasLegacyGeometry() && 1057 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == 1058 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { 1059 Wait.VmCnt = 0; 1060 } 1061 #if 0 // TODO: the following blocks of logic when we have fence. 
1062 else if (MI.getOpcode() == SC_FENCE) { 1063 const unsigned int group_size = 1064 context->shader_info->GetMaxThreadGroupSize(); 1065 // group_size == 0 means thread group size is unknown at compile time 1066 const bool group_is_multi_wave = 1067 (group_size == 0 || group_size > target_info->GetWaveFrontSize()); 1068 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence(); 1069 1070 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) { 1071 SCRegType src_type = Inst->GetSrcType(i); 1072 switch (src_type) { 1073 case SCMEM_LDS: 1074 if (group_is_multi_wave || 1075 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { 1076 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, 1077 ScoreBrackets->getScoreUB(LGKM_CNT)); 1078 // LDS may have to wait for VM_CNT after buffer load to LDS 1079 if (target_info->HasBufferLoadToLDS()) { 1080 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, 1081 ScoreBrackets->getScoreUB(VM_CNT)); 1082 } 1083 } 1084 break; 1085 1086 case SCMEM_GDS: 1087 if (group_is_multi_wave || fence_is_global) { 1088 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 1089 ScoreBrackets->getScoreUB(EXP_CNT)); 1090 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, 1091 ScoreBrackets->getScoreUB(LGKM_CNT)); 1092 } 1093 break; 1094 1095 case SCMEM_UAV: 1096 case SCMEM_TFBUF: 1097 case SCMEM_RING: 1098 case SCMEM_SCATTER: 1099 if (group_is_multi_wave || fence_is_global) { 1100 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 1101 ScoreBrackets->getScoreUB(EXP_CNT)); 1102 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, 1103 ScoreBrackets->getScoreUB(VM_CNT)); 1104 } 1105 break; 1106 1107 case SCMEM_SCRATCH: 1108 default: 1109 break; 1110 } 1111 } 1112 } 1113 #endif 1114 1115 // Export & GDS instructions do not read the EXEC mask until after the export 1116 // is granted (which can occur well after the instruction is issued). 1117 // The shader program must flush all EXP operations on the export-count 1118 // before overwriting the EXEC mask. 1119 else { 1120 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { 1121 // Export and GDS are tracked individually, either may trigger a waitcnt 1122 // for EXEC. 1123 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) || 1124 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) || 1125 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) || 1126 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) { 1127 Wait.ExpCnt = 0; 1128 } 1129 } 1130 1131 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { 1132 // The function is going to insert a wait on everything in its prolog. 1133 // This still needs to be careful if the call target is a load (e.g. a GOT 1134 // load). We also need to check WAW dependency with saved PC. 
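// A freshly constructed Waitcnt imposes no wait on any counter; the
// determineWait calls below then add only the constraints needed for the
// call-address (and return-address) registers.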
1135 Wait = AMDGPU::Waitcnt(); 1136 1137 int CallAddrOpIdx = 1138 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 1139 1140 if (MI.getOperand(CallAddrOpIdx).isReg()) { 1141 RegInterval CallAddrOpInterval = 1142 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx); 1143 1144 for (int RegNo = CallAddrOpInterval.first; 1145 RegNo < CallAddrOpInterval.second; ++RegNo) 1146 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); 1147 1148 int RtnAddrOpIdx = 1149 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); 1150 if (RtnAddrOpIdx != -1) { 1151 RegInterval RtnAddrOpInterval = 1152 ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx); 1153 1154 for (int RegNo = RtnAddrOpInterval.first; 1155 RegNo < RtnAddrOpInterval.second; ++RegNo) 1156 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); 1157 } 1158 } 1159 } else { 1160 // FIXME: Should not be relying on memoperands. 1161 // Look at the source operands of every instruction to see if 1162 // any of them results from a previous memory operation that affects 1163 // its current usage. If so, an s_waitcnt instruction needs to be 1164 // emitted. 1165 // If the source operand was defined by a load, add the s_waitcnt 1166 // instruction. 1167 // 1168 // Two cases are handled for destination operands: 1169 // 1) If the destination operand was defined by a load, add the s_waitcnt 1170 // instruction to guarantee the right WAW order. 1171 // 2) If a destination operand that was used by a recent export/store ins, 1172 // add s_waitcnt on exp_cnt to guarantee the WAR order. 1173 for (const MachineMemOperand *Memop : MI.memoperands()) { 1174 const Value *Ptr = Memop->getValue(); 1175 if (Memop->isStore() && SLoadAddresses.count(Ptr)) { 1176 addWait(Wait, LGKM_CNT, 0); 1177 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) 1178 SLoadAddresses.erase(Ptr); 1179 } 1180 unsigned AS = Memop->getAddrSpace(); 1181 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS) 1182 continue; 1183 // No need to wait before load from VMEM to LDS. 1184 if (TII->mayWriteLDSThroughDMA(MI)) 1185 continue; 1186 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; 1187 // VM_CNT is only relevant to vgpr or LDS. 1188 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); 1189 if (Memop->isStore()) { 1190 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); 1191 } 1192 } 1193 1194 // Loop over use and def operands. 1195 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 1196 MachineOperand &Op = MI.getOperand(I); 1197 if (!Op.isReg()) 1198 continue; 1199 1200 // If the instruction does not read tied source, skip the operand. 1201 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) 1202 continue; 1203 1204 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I); 1205 1206 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); 1207 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 1208 if (IsVGPR) { 1209 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the 1210 // previous write and this write are the same type of VMEM 1211 // instruction, in which case they're guaranteed to write their 1212 // results in order anyway. 
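// (The !updateVMCntOnly(MI) clause keeps the vmcnt check for writers, e.g. a
// plain FLAT access that may also touch LDS, to which the same-VmemType
// in-order guarantee does not apply.)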
1213 if (Op.isUse() || !updateVMCntOnly(MI) || 1214 ScoreBrackets.hasOtherPendingVmemTypes(RegNo, 1215 getVmemType(MI))) { 1216 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); 1217 ScoreBrackets.clearVgprVmemTypes(RegNo); 1218 } 1219 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { 1220 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); 1221 } 1222 } 1223 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); 1224 } 1225 } 1226 } 1227 } 1228 1229 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does 1230 // not, we need to ensure the subtarget is capable of backing off barrier 1231 // instructions in case there are any outstanding memory operations that may 1232 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 1233 if (MI.getOpcode() == AMDGPU::S_BARRIER && 1234 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { 1235 Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); 1236 } 1237 1238 // TODO: Remove this work-around, enable the assert for Bug 457939 1239 // after fixing the scheduler. Also, the Shader Compiler code is 1240 // independent of target. 1241 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { 1242 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { 1243 Wait.LgkmCnt = 0; 1244 } 1245 } 1246 1247 // Verify that the wait is actually needed. 1248 ScoreBrackets.simplifyWaitcnt(Wait); 1249 1250 if (ForceEmitZeroWaitcnts) 1251 Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(); 1252 1253 if (ForceEmitWaitcnt[VM_CNT]) 1254 Wait.VmCnt = 0; 1255 if (ForceEmitWaitcnt[EXP_CNT]) 1256 Wait.ExpCnt = 0; 1257 if (ForceEmitWaitcnt[LGKM_CNT]) 1258 Wait.LgkmCnt = 0; 1259 1260 if (FlushVmCnt) { 1261 if (ScoreBrackets.hasPendingEvent(VM_CNT)) 1262 Wait.VmCnt = 0; 1263 } 1264 1265 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, 1266 OldWaitcntInstr); 1267 } 1268 1269 // Add a waitcnt to flush the vmcnt counter at the end of the given block if 1270 // needed. 1271 bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, 1272 WaitcntBrackets &ScoreBrackets, 1273 MachineInstr *OldWaitcntInstr) { 1274 AMDGPU::Waitcnt Wait; 1275 1276 if (!ScoreBrackets.hasPendingEvent(VM_CNT)) 1277 return false; 1278 1279 Wait.VmCnt = 0; 1280 1281 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, 1282 OldWaitcntInstr); 1283 } 1284 1285 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, 1286 MachineBasicBlock::instr_iterator It, 1287 MachineBasicBlock &Block, 1288 WaitcntBrackets &ScoreBrackets, 1289 MachineInstr *OldWaitcntInstr) { 1290 bool Modified = false; 1291 const DebugLoc &DL = Block.findDebugLoc(It); 1292 1293 if (OldWaitcntInstr) 1294 // Try to merge the required wait with preexisting waitcnt instructions. 1295 // Also erase redundant waitcnt. 1296 Modified = 1297 applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); 1298 else 1299 ScoreBrackets.applyWaitcnt(Wait); 1300 1301 // ExpCnt can be merged into VINTERP. 
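// VINTERP carries its own waitexp operand, so instead of emitting a separate
// s_waitcnt for expcnt we can simply tighten that operand.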
1302 if (Wait.ExpCnt != ~0u && It != Block.instr_end() && 1303 SIInstrInfo::isVINTERP(*It)) { 1304 MachineOperand *WaitExp = 1305 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp); 1306 if (Wait.ExpCnt < WaitExp->getImm()) { 1307 WaitExp->setImm(Wait.ExpCnt); 1308 Modified = true; 1309 } 1310 Wait.ExpCnt = ~0u; 1311 1312 LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" 1313 << "Update Instr: " << *It); 1314 } 1315 1316 // Build new waitcnt instructions unless no wait is needed or the old waitcnt 1317 // instruction was modified to handle the required wait. 1318 if (Wait.hasWaitExceptVsCnt()) { 1319 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); 1320 [[maybe_unused]] auto SWaitInst = 1321 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); 1322 Modified = true; 1323 1324 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1325 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1326 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1327 } 1328 1329 if (Wait.hasWaitVsCnt()) { 1330 assert(ST->hasVscnt()); 1331 1332 [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1333 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1334 .addImm(Wait.VsCnt); 1335 Modified = true; 1336 1337 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1338 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1339 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1340 } 1341 return Modified; 1342 } 1343 1344 // This is a flat memory operation. Check to see if it has memory tokens other 1345 // than LDS. Other address spaces supported by flat memory operations involve 1346 // global memory. 1347 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { 1348 assert(TII->isFLAT(MI)); 1349 1350 // All flat instructions use the VMEM counter. 1351 assert(TII->usesVM_CNT(MI)); 1352 1353 // If there are no memory operands then conservatively assume the flat 1354 // operation may access VMEM. 1355 if (MI.memoperands_empty()) 1356 return true; 1357 1358 // See if any memory operand specifies an address space that involves VMEM. 1359 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces 1360 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION 1361 // (GDS) address space is not supported by flat operations. Therefore, simply 1362 // return true unless only the LDS address space is found. 1363 for (const MachineMemOperand *Memop : MI.memoperands()) { 1364 unsigned AS = Memop->getAddrSpace(); 1365 assert(AS != AMDGPUAS::REGION_ADDRESS); 1366 if (AS != AMDGPUAS::LOCAL_ADDRESS) 1367 return true; 1368 } 1369 1370 return false; 1371 } 1372 1373 // This is a flat memory operation. Check to see if it has memory tokens for 1374 // either LDS or FLAT. 1375 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { 1376 assert(TII->isFLAT(MI)); 1377 1378 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. 1379 if (!TII->usesLGKM_CNT(MI)) 1380 return false; 1381 1382 // If in tgsplit mode then there can be no use of LDS. 1383 if (ST->isTgSplitEnabled()) 1384 return false; 1385 1386 // If there are no memory operands then conservatively assume the flat 1387 // operation may access LDS. 1388 if (MI.memoperands_empty()) 1389 return true; 1390 1391 // See if any memory operand specifies an address space that involves LDS. 
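// FLAT_ADDRESS counts as "may be LDS" because a generic flat address can
// resolve to LDS at run time.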
1392 for (const MachineMemOperand *Memop : MI.memoperands()) { 1393 unsigned AS = Memop->getAddrSpace(); 1394 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) 1395 return true; 1396 } 1397 1398 return false; 1399 } 1400 1401 // This is a flat memory operation. Check to see if it has memory tokens for 1402 // either scratch or FLAT. 1403 bool SIInsertWaitcnts::mayAccessScratchThroughFlat( 1404 const MachineInstr &MI) const { 1405 assert(TII->isFLAT(MI)); 1406 1407 // SCRATCH instructions always access scratch. 1408 if (TII->isFLATScratch(MI)) 1409 return true; 1410 1411 // GLOBAL instructions never access scratch. 1412 if (TII->isFLATGlobal(MI)) 1413 return false; 1414 1415 // If there are no memory operands then conservatively assume the flat 1416 // operation may access scratch. 1417 if (MI.memoperands_empty()) 1418 return true; 1419 1420 // See if any memory operand specifies an address space that involves scratch. 1421 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { 1422 unsigned AS = Memop->getAddrSpace(); 1423 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; 1424 }); 1425 } 1426 1427 static bool isCacheInvOrWBInst(MachineInstr &Inst) { 1428 auto Opc = Inst.getOpcode(); 1429 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || 1430 Opc == AMDGPU::GLOBAL_WBINV; 1431 } 1432 1433 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, 1434 WaitcntBrackets *ScoreBrackets) { 1435 // Now look at the instruction opcode. If it is a memory access 1436 // instruction, update the upper-bound of the appropriate counter's 1437 // bracket and the destination operand scores. 1438 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 1439 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { 1440 if (TII->isAlwaysGDS(Inst.getOpcode()) || 1441 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { 1442 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); 1443 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); 1444 } else { 1445 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 1446 } 1447 } else if (TII->isFLAT(Inst)) { 1448 // TODO: Track this properly. 1449 if (isCacheInvOrWBInst(Inst)) 1450 return; 1451 1452 assert(Inst.mayLoadOrStore()); 1453 1454 int FlatASCount = 0; 1455 1456 if (mayAccessVMEMThroughFlat(Inst)) { 1457 ++FlatASCount; 1458 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), 1459 Inst); 1460 } 1461 1462 if (mayAccessLDSThroughFlat(Inst)) { 1463 ++FlatASCount; 1464 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 1465 } 1466 1467 // A Flat memory operation must access at least one address space. 1468 assert(FlatASCount); 1469 1470 // This is a flat memory operation that access both VMEM and LDS, so note it 1471 // - it will require that both the VM and LGKM be flushed to zero if it is 1472 // pending when a VM or LGKM dependency occurs. 
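// (Once a flat access is pending on both counters, neither vmcnt nor lgkmcnt
// alone proves it has completed, which is why determineWait() forces a
// conservative wait of 0 for these counters on targets without in-order flat
// counts.)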
1473 if (FlatASCount > 1) 1474 ScoreBrackets->setPendingFlat(); 1475 } else if (SIInstrInfo::isVMEM(Inst) && 1476 !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { 1477 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), 1478 Inst); 1479 1480 if (ST->vmemWriteNeedsExpWaitcnt() && 1481 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) { 1482 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); 1483 } 1484 } else if (TII->isSMRD(Inst)) { 1485 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); 1486 } else if (Inst.isCall()) { 1487 if (callWaitsOnFunctionReturn(Inst)) { 1488 // Act as a wait on everything. 1489 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt()); 1490 } else { 1491 // May need to wait for anything. 1492 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); 1493 } 1494 } else if (SIInstrInfo::isLDSDIR(Inst)) { 1495 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); 1496 } else if (TII->isVINTERP(Inst)) { 1497 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); 1498 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); 1499 } else if (SIInstrInfo::isEXP(Inst)) { 1500 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); 1501 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) 1502 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); 1503 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST) 1504 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); 1505 else 1506 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); 1507 } else { 1508 switch (Inst.getOpcode()) { 1509 case AMDGPU::S_SENDMSG: 1510 case AMDGPU::S_SENDMSG_RTN_B32: 1511 case AMDGPU::S_SENDMSG_RTN_B64: 1512 case AMDGPU::S_SENDMSGHALT: 1513 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); 1514 break; 1515 case AMDGPU::S_MEMTIME: 1516 case AMDGPU::S_MEMREALTIME: 1517 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: 1518 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: 1519 case AMDGPU::S_BARRIER_LEAVE: 1520 case AMDGPU::S_GET_BARRIER_STATE_M0: 1521 case AMDGPU::S_GET_BARRIER_STATE_IMM: 1522 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); 1523 break; 1524 } 1525 } 1526 } 1527 1528 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, 1529 unsigned OtherScore) { 1530 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; 1531 unsigned OtherShifted = 1532 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; 1533 Score = std::max(MyShifted, OtherShifted); 1534 return OtherShifted > MyShifted; 1535 } 1536 1537 /// Merge the pending events and associated score brackets of \p Other into 1538 /// this bracket's status. 1539 /// 1540 /// Returns whether the merge resulted in a change that requires tighter waits 1541 /// (i.e. the merged brackets strictly dominate the original brackets).
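/// Merging re-bases both sides onto a common per-counter upper bound (the
/// MergeInfo shifts below) so that the relative ages of pending events are
/// preserved across the join.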
1542 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { 1543 bool StrictDom = false; 1544 1545 VgprUB = std::max(VgprUB, Other.VgprUB); 1546 SgprUB = std::max(SgprUB, Other.SgprUB); 1547 1548 for (auto T : inst_counter_types()) { 1549 // Merge event flags for this counter 1550 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; 1551 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; 1552 if (OtherEvents & ~OldEvents) 1553 StrictDom = true; 1554 PendingEvents |= OtherEvents; 1555 1556 // Merge scores for this counter 1557 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; 1558 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; 1559 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending); 1560 if (NewUB < ScoreLBs[T]) 1561 report_fatal_error("waitcnt score overflow"); 1562 1563 MergeInfo M; 1564 M.OldLB = ScoreLBs[T]; 1565 M.OtherLB = Other.ScoreLBs[T]; 1566 M.MyShift = NewUB - ScoreUBs[T]; 1567 M.OtherShift = NewUB - Other.ScoreUBs[T]; 1568 1569 ScoreUBs[T] = NewUB; 1570 1571 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); 1572 1573 for (int J = 0; J <= VgprUB; J++) 1574 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); 1575 1576 if (T == LGKM_CNT) { 1577 for (int J = 0; J <= SgprUB; J++) 1578 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); 1579 } 1580 } 1581 1582 for (int J = 0; J <= VgprUB; J++) { 1583 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; 1584 StrictDom |= NewVmemTypes != VgprVmemTypes[J]; 1585 VgprVmemTypes[J] = NewVmemTypes; 1586 } 1587 1588 return StrictDom; 1589 } 1590 1591 static bool isWaitInstr(MachineInstr &Inst) { 1592 auto Opcode = Inst.getOpcode(); 1593 return SIInstrInfo::isWaitcnt(Opcode) || 1594 (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() && 1595 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 1596 } 1597 1598 // Generate s_waitcnt instructions where needed. 1599 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, 1600 MachineBasicBlock &Block, 1601 WaitcntBrackets &ScoreBrackets) { 1602 bool Modified = false; 1603 1604 LLVM_DEBUG({ 1605 dbgs() << "*** Block" << Block.getNumber() << " ***"; 1606 ScoreBrackets.dump(); 1607 }); 1608 1609 // Track the correctness of vccz through this basic block. There are two 1610 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and 1611 // ST->partialVCCWritesUpdateVCCZ(). 1612 bool VCCZCorrect = true; 1613 if (ST->hasReadVCCZBug()) { 1614 // vccz could be incorrect at a basic block boundary if a predecessor wrote 1615 // to vcc and then issued an smem load. 1616 VCCZCorrect = false; 1617 } else if (!ST->partialVCCWritesUpdateVCCZ()) { 1618 // vccz could be incorrect at a basic block boundary if a predecessor wrote 1619 // to vcc_lo or vcc_hi. 1620 VCCZCorrect = false; 1621 } 1622 1623 // Walk over the instructions. 1624 MachineInstr *OldWaitcntInstr = nullptr; 1625 1626 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), 1627 E = Block.instr_end(); 1628 Iter != E;) { 1629 MachineInstr &Inst = *Iter; 1630 1631 // Track pre-existing waitcnts that were added in earlier iterations or by 1632 // the memory legalizer. 
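// Only the first waitcnt of such a run needs to be remembered;
// applyPreexistingWaitcnt() walks forward from it and folds or erases the
// rest.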
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
          Inst.definesRegister(AMDGPU::VCC_HI)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC)) {
        // There is a hardware bug on CI/SI where an SMRD instruction may
        // corrupt the vccz bit, so when we detect that an instruction may
        // read from a corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates an S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ?
                           AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets))
    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);

  return Modified;
}

// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) {
  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
  if (!IsInserted)
    return Iterator->second;

  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
  if (!Succ)
    return false;

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return false;

  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets)) {
    Iterator->second = true;
    return true;
  }

  return false;
}

bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
  return SIInstrInfo::isVMEM(MI) ||
         (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
}

// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
        // Vgpr use
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(RegNo))
              return false;
            VgprUse.insert(RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(RegNo, VM_CNT) >
                Brackets.getScoreLB(VM_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        }
        // VMem load vgpr def
        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
      }
    }
  }
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  return HasVMemLoad && UsesVgprLoadedOutside;
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MLI = &getAnalysis<MachineLoopInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  OptNone = MF.getFunction().hasOptNone() ||
            MF.getTarget().getOptLevel() == CodeGenOptLevel::None;

  HardwareLimits Limits = {};
  Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
  Limits.VscntMax = ST->hasVscnt() ? 63 : 0;

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding Encoding = {};
  Encoding.VGPR0 =
      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 =
      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  BlockInfos.clear();
  bool Modified = false;

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller-preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    MachineBasicBlock::iterator I = EntryBB.begin();
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;
    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);

    auto NonKernelInitialState =
        std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
    NonKernelInitialState->setNonKernelFunctionInitialState();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fixed point is reached.
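  // Each block starts from the bracket state merged in from its already
  // visited predecessors (BlockInfo::Incoming). Whenever processing a block
  // changes a successor's incoming state, that successor is marked dirty
  // again, and if it appears no later in the traversal order (i.e. along a
  // back edge) another sweep is requested via Repeat.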
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo()});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
        else
          *Brackets = WaitcntBrackets(ST, Limits, Encoding);
      }

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt, if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
  // instructions.
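  // On targets that require it, an S_NOP is placed in front of the message.
  // The emitted sequence then looks roughly like the following (assembly
  // syntax shown for illustration only):
  //   s_nop 0
  //   s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  //   s_endpgm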
  for (MachineInstr *MI : ReleaseVGPRInsts) {
    if (ST->requiresNopBeforeDeallocVGPRs()) {
      BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
    }
    BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
    Modified = true;
  }
  ReleaseVGPRInsts.clear();

  return Modified;
}