1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Insert wait instructions for memory reads and writes. 11 /// 12 /// Memory reads and writes are issued asynchronously, so we need to insert 13 /// S_WAITCNT instructions when we want to access any of their results or 14 /// overwrite any register that's used asynchronously. 15 /// 16 /// TODO: This pass currently keeps one timeline per hardware counter. A more 17 /// finely-grained approach that keeps one timeline per event type could 18 /// sometimes get away with generating weaker s_waitcnt instructions. For 19 /// example, when both SMEM and LDS are in flight and we need to wait for 20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, 21 /// but the pass will currently generate a conservative lgkmcnt(0) because 22 /// multiple event types are in flight. 23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "AMDGPU.h" 27 #include "GCNSubtarget.h" 28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 29 #include "SIMachineFunctionInfo.h" 30 #include "Utils/AMDGPUBaseInfo.h" 31 #include "llvm/ADT/MapVector.h" 32 #include "llvm/ADT/PostOrderIterator.h" 33 #include "llvm/ADT/Sequence.h" 34 #include "llvm/Analysis/AliasAnalysis.h" 35 #include "llvm/CodeGen/MachineLoopInfo.h" 36 #include "llvm/CodeGen/MachinePostDominators.h" 37 #include "llvm/InitializePasses.h" 38 #include "llvm/Support/DebugCounter.h" 39 #include "llvm/TargetParser/TargetParser.h" 40 using namespace llvm; 41 42 #define DEBUG_TYPE "si-insert-waitcnts" 43 44 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp", 45 "Force emit s_waitcnt expcnt(0) instrs"); 46 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", 47 "Force emit s_waitcnt lgkmcnt(0) instrs"); 48 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", 49 "Force emit s_waitcnt vmcnt(0) instrs"); 50 51 static cl::opt<bool> ForceEmitZeroFlag( 52 "amdgpu-waitcnt-forcezero", 53 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), 54 cl::init(false), cl::Hidden); 55 56 namespace { 57 // Class of object that encapsulates latest instruction counter score 58 // associated with the operand. Used for determining whether 59 // s_waitcnt instruction needs to be emitted. 60 61 enum InstCounterType { 62 LOAD_CNT = 0, // VMcnt prior to gfx12. 63 DS_CNT, // LKGMcnt prior to gfx12. 64 EXP_CNT, // 65 STORE_CNT, // VScnt in gfx10/gfx11. 66 NUM_NORMAL_INST_CNTS, 67 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. 68 BVH_CNT, // gfx12+ only. 69 KM_CNT, // gfx12+ only. 70 NUM_EXTENDED_INST_CNTS, 71 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS 72 }; 73 } // namespace 74 75 namespace llvm { 76 template <> struct enum_iteration_traits<InstCounterType> { 77 static constexpr bool is_iterable = true; 78 }; 79 } // namespace llvm 80 81 namespace { 82 // Return an iterator over all counters between LOAD_CNT (the first counter) 83 // and \c MaxCounter (exclusive, default value yields an enumeration over 84 // all counters). 
85 auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { 86 return enum_seq(LOAD_CNT, MaxCounter); 87 } 88 89 using RegInterval = std::pair<int, int>; 90 91 struct HardwareLimits { 92 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. 93 unsigned ExpcntMax; 94 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. 95 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. 96 unsigned SamplecntMax; // gfx12+ only. 97 unsigned BvhcntMax; // gfx12+ only. 98 unsigned KmcntMax; // gfx12+ only. 99 }; 100 101 struct RegisterEncoding { 102 unsigned VGPR0; 103 unsigned VGPRL; 104 unsigned SGPR0; 105 unsigned SGPRL; 106 }; 107 108 enum WaitEventType { 109 VMEM_ACCESS, // vector-memory read & write 110 VMEM_READ_ACCESS, // vector-memory read 111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only) 112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only) 113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch 114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch 115 LDS_ACCESS, // lds read & write 116 GDS_ACCESS, // gds read & write 117 SQ_MESSAGE, // send message 118 SMEM_ACCESS, // scalar-memory read & write 119 EXP_GPR_LOCK, // export holding on its data src 120 GDS_GPR_LOCK, // GDS holding on its data and addr src 121 EXP_POS_ACCESS, // write to export position 122 EXP_PARAM_ACCESS, // write to export parameter 123 VMW_GPR_LOCK, // vector-memory write holding on its data src 124 EXP_LDS_ACCESS, // read by ldsdir counting as export 125 NUM_WAIT_EVENTS, 126 }; 127 128 // The mapping is: 129 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs 130 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots 131 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs 132 // We reserve a fixed number of VGPR slots in the scoring tables for 133 // special tokens like SCMEM_LDS (needed for buffer load to LDS). 134 enum RegisterMapping { 135 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. 136 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. 137 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. 138 NUM_EXTRA_VGPRS = 9, // Reserved slots for DS. 139 // Artificial register slots to track LDS writes into specific LDS locations 140 // if a location is known. When slots are exhausted or location is 141 // unknown use the first slot. The first slot is also always updated in 142 // addition to known location's slot to properly generate waits if dependent 143 // instruction's location is unknown. 144 EXTRA_VGPR_LDS = 0, 145 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. 146 }; 147 148 // Enumerate different types of result-returning VMEM operations. Although 149 // s_waitcnt orders them all with a single vmcnt counter, in the absence of 150 // s_waitcnt only instructions of the same VmemType are guaranteed to write 151 // their results in order -- so there is no need to insert an s_waitcnt between 152 // two instructions of the same type that write the same vgpr. 153 enum VmemType { 154 // BUF instructions and MIMG instructions without a sampler. 155 VMEM_NOSAMPLER, 156 // MIMG instructions with a sampler. 157 VMEM_SAMPLER, 158 // BVH instructions 159 VMEM_BVH, 160 NUM_VMEM_TYPES 161 }; 162 163 // Maps values of InstCounterType to the instruction that waits on that 164 // counter. Only used if GCNSubtarget::hasExtendedWaitCounts() 165 // returns true. 
166 static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { 167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, 168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, 169 AMDGPU::S_WAIT_KMCNT}; 170 171 static bool updateVMCntOnly(const MachineInstr &Inst) { 172 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) || 173 SIInstrInfo::isFLATScratch(Inst); 174 } 175 176 #ifndef NDEBUG 177 static bool isNormalMode(InstCounterType MaxCounter) { 178 return MaxCounter == NUM_NORMAL_INST_CNTS; 179 } 180 #endif // NDEBUG 181 182 VmemType getVmemType(const MachineInstr &Inst) { 183 assert(updateVMCntOnly(Inst)); 184 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) && 185 !SIInstrInfo::isVSAMPLE(Inst)) 186 return VMEM_NOSAMPLER; 187 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); 188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = 189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 190 return BaseInfo->BVH ? VMEM_BVH 191 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER; 192 } 193 194 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { 195 switch (T) { 196 case LOAD_CNT: 197 return Wait.LoadCnt; 198 case EXP_CNT: 199 return Wait.ExpCnt; 200 case DS_CNT: 201 return Wait.DsCnt; 202 case STORE_CNT: 203 return Wait.StoreCnt; 204 case SAMPLE_CNT: 205 return Wait.SampleCnt; 206 case BVH_CNT: 207 return Wait.BvhCnt; 208 case KM_CNT: 209 return Wait.KmCnt; 210 default: 211 llvm_unreachable("bad InstCounterType"); 212 } 213 } 214 215 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { 216 unsigned &WC = getCounterRef(Wait, T); 217 WC = std::min(WC, Count); 218 } 219 220 void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { 221 getCounterRef(Wait, T) = ~0u; 222 } 223 224 unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { 225 return getCounterRef(Wait, T); 226 } 227 228 // Mapping from event to counter according to the table masks. 229 InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { 230 for (auto T : inst_counter_types()) { 231 if (masks[T] & (1 << E)) 232 return T; 233 } 234 llvm_unreachable("event type has no associated counter"); 235 } 236 237 // This objects maintains the current score brackets of each wait counter, and 238 // a per-register scoreboard for each wait counter. 239 // 240 // We also maintain the latest score for every event type that can change the 241 // waitcnt in order to know if there are multiple types of events within 242 // the brackets. When multiple types of event happen in the bracket, 243 // wait count may get decreased out of order, therefore we need to put in 244 // "s_waitcnt 0" before use. 
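//
// Worked example (illustrative, not from the original comment): if three VMEM
// loads issue after the last wait, their destination registers receive the
// scores LB+1, LB+2 and LB+3 and the upper bound becomes UB = LB+3. A use of
// the first load's result then only needs vmcnt(2), i.e. UB minus that score,
// because results of the same VMEM type return in order and the two newer
// loads may remain outstanding.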
245 class WaitcntBrackets { 246 public: 247 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, 248 HardwareLimits Limits, RegisterEncoding Encoding, 249 const unsigned *WaitEventMaskForInst, 250 InstCounterType SmemAccessCounter) 251 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), 252 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst), 253 SmemAccessCounter(SmemAccessCounter) {} 254 255 unsigned getWaitCountMax(InstCounterType T) const { 256 switch (T) { 257 case LOAD_CNT: 258 return Limits.LoadcntMax; 259 case DS_CNT: 260 return Limits.DscntMax; 261 case EXP_CNT: 262 return Limits.ExpcntMax; 263 case STORE_CNT: 264 return Limits.StorecntMax; 265 case SAMPLE_CNT: 266 return Limits.SamplecntMax; 267 case BVH_CNT: 268 return Limits.BvhcntMax; 269 case KM_CNT: 270 return Limits.KmcntMax; 271 default: 272 break; 273 } 274 return 0; 275 } 276 277 unsigned getScoreLB(InstCounterType T) const { 278 assert(T < NUM_INST_CNTS); 279 return ScoreLBs[T]; 280 } 281 282 unsigned getScoreUB(InstCounterType T) const { 283 assert(T < NUM_INST_CNTS); 284 return ScoreUBs[T]; 285 } 286 287 unsigned getScoreRange(InstCounterType T) const { 288 return getScoreUB(T) - getScoreLB(T); 289 } 290 291 unsigned getRegScore(int GprNo, InstCounterType T) const { 292 if (GprNo < NUM_ALL_VGPRS) { 293 return VgprScores[T][GprNo]; 294 } 295 assert(T == SmemAccessCounter); 296 return SgprScores[GprNo - NUM_ALL_VGPRS]; 297 } 298 299 bool merge(const WaitcntBrackets &Other); 300 301 RegInterval getRegInterval(const MachineInstr *MI, 302 const MachineRegisterInfo *MRI, 303 const SIRegisterInfo *TRI, unsigned OpNo) const; 304 305 bool counterOutOfOrder(InstCounterType T) const; 306 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; 307 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; 308 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; 309 void applyWaitcnt(const AMDGPU::Waitcnt &Wait); 310 void applyWaitcnt(InstCounterType T, unsigned Count); 311 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, 312 const MachineRegisterInfo *MRI, WaitEventType E, 313 MachineInstr &MI); 314 315 unsigned hasPendingEvent() const { return PendingEvents; } 316 unsigned hasPendingEvent(WaitEventType E) const { 317 return PendingEvents & (1 << E); 318 } 319 unsigned hasPendingEvent(InstCounterType T) const { 320 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; 321 assert((HasPending != 0) == (getScoreRange(T) != 0)); 322 return HasPending; 323 } 324 325 bool hasMixedPendingEvents(InstCounterType T) const { 326 unsigned Events = hasPendingEvent(T); 327 // Return true if more than one bit is set in Events. 328 return Events & (Events - 1); 329 } 330 331 bool hasPendingFlat() const { 332 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && 333 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || 334 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && 335 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); 336 } 337 338 void setPendingFlat() { 339 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; 340 LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; 341 } 342 343 // Return true if there might be pending writes to the specified vgpr by VMEM 344 // instructions with types different from V. 
345 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { 346 assert(GprNo < NUM_ALL_VGPRS); 347 return VgprVmemTypes[GprNo] & ~(1 << V); 348 } 349 350 void clearVgprVmemTypes(int GprNo) { 351 assert(GprNo < NUM_ALL_VGPRS); 352 VgprVmemTypes[GprNo] = 0; 353 } 354 355 void setStateOnFunctionEntryOrReturn() { 356 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); 357 PendingEvents |= WaitEventMaskForInst[STORE_CNT]; 358 } 359 360 ArrayRef<const MachineInstr *> getLDSDMAStores() const { 361 return LDSDMAStores; 362 } 363 364 void print(raw_ostream &); 365 void dump() { print(dbgs()); } 366 367 private: 368 struct MergeInfo { 369 unsigned OldLB; 370 unsigned OtherLB; 371 unsigned MyShift; 372 unsigned OtherShift; 373 }; 374 static bool mergeScore(const MergeInfo &M, unsigned &Score, 375 unsigned OtherScore); 376 377 void setScoreLB(InstCounterType T, unsigned Val) { 378 assert(T < NUM_INST_CNTS); 379 ScoreLBs[T] = Val; 380 } 381 382 void setScoreUB(InstCounterType T, unsigned Val) { 383 assert(T < NUM_INST_CNTS); 384 ScoreUBs[T] = Val; 385 386 if (T != EXP_CNT) 387 return; 388 389 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) 390 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); 391 } 392 393 void setRegScore(int GprNo, InstCounterType T, unsigned Val) { 394 if (GprNo < NUM_ALL_VGPRS) { 395 VgprUB = std::max(VgprUB, GprNo); 396 VgprScores[T][GprNo] = Val; 397 } else { 398 assert(T == SmemAccessCounter); 399 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); 400 SgprScores[GprNo - NUM_ALL_VGPRS] = Val; 401 } 402 } 403 404 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, 405 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, 406 unsigned OpNo, unsigned Val); 407 408 const GCNSubtarget *ST = nullptr; 409 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; 410 HardwareLimits Limits = {}; 411 RegisterEncoding Encoding = {}; 412 const unsigned *WaitEventMaskForInst; 413 InstCounterType SmemAccessCounter; 414 unsigned ScoreLBs[NUM_INST_CNTS] = {0}; 415 unsigned ScoreUBs[NUM_INST_CNTS] = {0}; 416 unsigned PendingEvents = 0; 417 // Remember the last flat memory operation. 418 unsigned LastFlat[NUM_INST_CNTS] = {0}; 419 // wait_cnt scores for every vgpr. 420 // Keep track of the VgprUB and SgprUB to make merge at join efficient. 421 int VgprUB = -1; 422 int SgprUB = -1; 423 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; 424 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt 425 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant. 426 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; 427 // Bitmask of the VmemTypes of VMEM instructions that might have a pending 428 // write to each vgpr. 429 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; 430 // Store representative LDS DMA operations. The only useful info here is 431 // alias info. One store is kept per unique AAInfo. 432 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores; 433 }; 434 435 // This abstracts the logic for generating and updating S_WAIT* instructions 436 // away from the analysis that determines where they are needed. This was 437 // done because the set of counters and instructions for waiting on them 438 // underwent a major shift with gfx12, sufficiently so that having this 439 // abstraction allows the main analysis logic to be simpler than it would 440 // otherwise have had to become. 
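//
// For illustration: pre-gfx12 targets express most waits as a single packed
// S_WAITCNT immediate plus a separate S_WAITCNT_VSCNT, whereas gfx12+ uses
// per-counter instructions such as S_WAIT_LOADCNT and S_WAIT_DSCNT as well as
// the combined S_WAIT_LOADCNT_DSCNT / S_WAIT_STORECNT_DSCNT forms. The two
// subclasses below encapsulate that difference.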
class WaitcntGenerator {
protected:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;

public:
  WaitcntGenerator() {}
  WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
      : ST(ST), TII(ST->getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}

  // Edits an existing sequence of wait count instructions according
  // to an incoming Waitcnt value, which is itself updated to reflect
  // any new wait count instructions which may need to be generated by
  // WaitcntGenerator::createNewWaitcnt(). Returns true if any edits
  // were made.
  //
  // This editing will usually consist of merely updating operands, but it
  // may also delete instructions if the incoming Wait value indicates they
  // are not needed. It may also remove existing instructions for which a
  // wait is needed if it can be determined that it is better to generate new
  // instructions later, as can happen on gfx12.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generates new wait count instructions according to the value of
  // Wait, returning true if any new instructions were created.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait) = 0;

  // Returns an array of bit masks which can be used to map values in
  // WaitEventType to corresponding counter values in InstCounterType.
481 virtual const unsigned *getWaitEventMask() const = 0; 482 483 virtual ~WaitcntGenerator() = default; 484 }; 485 486 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { 487 public: 488 WaitcntGeneratorPreGFX12() {} 489 WaitcntGeneratorPreGFX12(const GCNSubtarget *ST) 490 : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {} 491 492 bool 493 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, 494 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, 495 MachineBasicBlock::instr_iterator It) const override; 496 497 bool createNewWaitcnt(MachineBasicBlock &Block, 498 MachineBasicBlock::instr_iterator It, 499 AMDGPU::Waitcnt Wait) override; 500 501 const unsigned *getWaitEventMask() const override { 502 assert(ST); 503 504 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { 505 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) | 506 (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS), 507 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | 508 (1 << SQ_MESSAGE), 509 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | 510 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | 511 (1 << EXP_LDS_ACCESS), 512 (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS), 513 0, 514 0, 515 0}; 516 517 return WaitEventMaskForInstPreGFX12; 518 } 519 }; 520 521 class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { 522 public: 523 WaitcntGeneratorGFX12Plus() {} 524 WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter) 525 : WaitcntGenerator(ST, MaxCounter) {} 526 527 bool 528 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, 529 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, 530 MachineBasicBlock::instr_iterator It) const override; 531 532 bool createNewWaitcnt(MachineBasicBlock &Block, 533 MachineBasicBlock::instr_iterator It, 534 AMDGPU::Waitcnt Wait) override; 535 536 const unsigned *getWaitEventMask() const override { 537 assert(ST); 538 539 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { 540 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), 541 (1 << LDS_ACCESS) | (1 << GDS_ACCESS), 542 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | 543 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | 544 (1 << EXP_LDS_ACCESS), 545 (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS), 546 (1 << VMEM_SAMPLER_READ_ACCESS), 547 (1 << VMEM_BVH_READ_ACCESS), 548 (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)}; 549 550 return WaitEventMaskForInstGFX12Plus; 551 } 552 }; 553 554 class SIInsertWaitcnts : public MachineFunctionPass { 555 private: 556 const GCNSubtarget *ST = nullptr; 557 const SIInstrInfo *TII = nullptr; 558 const SIRegisterInfo *TRI = nullptr; 559 const MachineRegisterInfo *MRI = nullptr; 560 561 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; 562 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; 563 MachineLoopInfo *MLI; 564 MachinePostDominatorTree *PDT; 565 AliasAnalysis *AA = nullptr; 566 567 struct BlockInfo { 568 std::unique_ptr<WaitcntBrackets> Incoming; 569 bool Dirty = true; 570 }; 571 572 InstCounterType SmemAccessCounter; 573 574 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; 575 576 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 577 // because of amdgpu-waitcnt-forcezero flag 578 bool ForceEmitZeroWaitcnts; 579 bool ForceEmitWaitcnt[NUM_INST_CNTS]; 580 581 bool OptNone; 582 583 // In any given run of this pass, WCG will point to one of these two 584 // generator objects, which must have been re-initialised before use 585 // from a 
value made using a subtarget constructor. 586 WaitcntGeneratorPreGFX12 WCGPreGFX12; 587 WaitcntGeneratorGFX12Plus WCGGFX12Plus; 588 589 WaitcntGenerator *WCG = nullptr; 590 591 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS 592 // message. 593 DenseSet<MachineInstr *> ReleaseVGPRInsts; 594 595 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; 596 597 public: 598 static char ID; 599 600 SIInsertWaitcnts() : MachineFunctionPass(ID) { 601 (void)ForceExpCounter; 602 (void)ForceLgkmCounter; 603 (void)ForceVMCounter; 604 } 605 606 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); 607 bool isPreheaderToFlush(MachineBasicBlock &MBB, 608 WaitcntBrackets &ScoreBrackets); 609 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; 610 bool runOnMachineFunction(MachineFunction &MF) override; 611 612 StringRef getPassName() const override { 613 return "SI insert wait instructions"; 614 } 615 616 void getAnalysisUsage(AnalysisUsage &AU) const override { 617 AU.setPreservesCFG(); 618 AU.addRequired<MachineLoopInfo>(); 619 AU.addRequired<MachinePostDominatorTree>(); 620 AU.addUsedIfAvailable<AAResultsWrapperPass>(); 621 AU.addPreserved<AAResultsWrapperPass>(); 622 MachineFunctionPass::getAnalysisUsage(AU); 623 } 624 625 bool isForceEmitWaitcnt() const { 626 for (auto T : inst_counter_types()) 627 if (ForceEmitWaitcnt[T]) 628 return true; 629 return false; 630 } 631 632 void setForceEmitWaitcnt() { 633 // For non-debug builds, ForceEmitWaitcnt has been initialized to false; 634 // For debug builds, get the debug counter info and adjust if need be 635 #ifndef NDEBUG 636 if (DebugCounter::isCounterSet(ForceExpCounter) && 637 DebugCounter::shouldExecute(ForceExpCounter)) { 638 ForceEmitWaitcnt[EXP_CNT] = true; 639 } else { 640 ForceEmitWaitcnt[EXP_CNT] = false; 641 } 642 643 if (DebugCounter::isCounterSet(ForceLgkmCounter) && 644 DebugCounter::shouldExecute(ForceLgkmCounter)) { 645 ForceEmitWaitcnt[DS_CNT] = true; 646 ForceEmitWaitcnt[KM_CNT] = true; 647 } else { 648 ForceEmitWaitcnt[DS_CNT] = false; 649 ForceEmitWaitcnt[KM_CNT] = false; 650 } 651 652 if (DebugCounter::isCounterSet(ForceVMCounter) && 653 DebugCounter::shouldExecute(ForceVMCounter)) { 654 ForceEmitWaitcnt[LOAD_CNT] = true; 655 ForceEmitWaitcnt[SAMPLE_CNT] = true; 656 ForceEmitWaitcnt[BVH_CNT] = true; 657 } else { 658 ForceEmitWaitcnt[LOAD_CNT] = false; 659 ForceEmitWaitcnt[SAMPLE_CNT] = false; 660 ForceEmitWaitcnt[BVH_CNT] = false; 661 } 662 #endif // NDEBUG 663 } 664 665 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or 666 // FLAT instruction. 667 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { 668 // Maps VMEM access types to their corresponding WaitEventType. 669 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { 670 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; 671 672 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); 673 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side 674 // these should use VM_CNT. 675 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) 676 return VMEM_ACCESS; 677 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) { 678 // FLAT and SCRATCH instructions may access scratch. Other VMEM 679 // instructions do not. 
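      // For example (illustrative): a plain buffer or global store is
      // classified as VMEM_WRITE_ACCESS by the code below, while a flat store
      // whose address may refer to scratch is classified as
      // SCRATCH_WRITE_ACCESS so that outstanding scratch stores can be
      // distinguished later (e.g. by the S_ENDPGM handling in
      // generateWaitcntInstBefore()).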
680 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) 681 return SCRATCH_WRITE_ACCESS; 682 return VMEM_WRITE_ACCESS; 683 } 684 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) 685 return VMEM_READ_ACCESS; 686 return VmemReadMapping[getVmemType(Inst)]; 687 } 688 689 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; 690 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; 691 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; 692 bool generateWaitcntInstBefore(MachineInstr &MI, 693 WaitcntBrackets &ScoreBrackets, 694 MachineInstr *OldWaitcntInstr, 695 bool FlushVmCnt); 696 bool generateWaitcntBlockEnd(MachineBasicBlock &Block, 697 WaitcntBrackets &ScoreBrackets, 698 MachineInstr *OldWaitcntInstr); 699 bool generateWaitcnt(AMDGPU::Waitcnt Wait, 700 MachineBasicBlock::instr_iterator It, 701 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, 702 MachineInstr *OldWaitcntInstr); 703 void updateEventWaitcntAfter(MachineInstr &Inst, 704 WaitcntBrackets *ScoreBrackets); 705 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, 706 WaitcntBrackets &ScoreBrackets); 707 }; 708 709 } // end anonymous namespace 710 711 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, 712 const MachineRegisterInfo *MRI, 713 const SIRegisterInfo *TRI, 714 unsigned OpNo) const { 715 const MachineOperand &Op = MI->getOperand(OpNo); 716 if (!TRI->isInAllocatableClass(Op.getReg())) 717 return {-1, -1}; 718 719 // A use via a PW operand does not need a waitcnt. 720 // A partial write is not a WAW. 721 assert(!Op.getSubReg() || !Op.isUndef()); 722 723 RegInterval Result; 724 725 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & 726 AMDGPU::HWEncoding::REG_IDX_MASK; 727 728 if (TRI->isVectorRegister(*MRI, Op.getReg())) { 729 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); 730 Result.first = Reg - Encoding.VGPR0; 731 if (TRI->isAGPR(*MRI, Op.getReg())) 732 Result.first += AGPR_OFFSET; 733 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); 734 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { 735 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); 736 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; 737 assert(Result.first >= NUM_ALL_VGPRS && 738 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); 739 } 740 // TODO: Handle TTMP 741 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... 
742 else 743 return {-1, -1}; 744 745 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); 746 unsigned Size = TRI->getRegSizeInBits(*RC); 747 Result.second = Result.first + ((Size + 16) / 32); 748 749 return Result; 750 } 751 752 void WaitcntBrackets::setExpScore(const MachineInstr *MI, 753 const SIInstrInfo *TII, 754 const SIRegisterInfo *TRI, 755 const MachineRegisterInfo *MRI, unsigned OpNo, 756 unsigned Val) { 757 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo); 758 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); 759 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 760 setRegScore(RegNo, EXP_CNT, Val); 761 } 762 } 763 764 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, 765 const SIRegisterInfo *TRI, 766 const MachineRegisterInfo *MRI, 767 WaitEventType E, MachineInstr &Inst) { 768 InstCounterType T = eventCounter(WaitEventMaskForInst, E); 769 770 unsigned UB = getScoreUB(T); 771 unsigned CurrScore = UB + 1; 772 if (CurrScore == 0) 773 report_fatal_error("InsertWaitcnt score wraparound"); 774 // PendingEvents and ScoreUB need to be update regardless if this event 775 // changes the score of a register or not. 776 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. 777 PendingEvents |= 1 << E; 778 setScoreUB(T, CurrScore); 779 780 if (T == EXP_CNT) { 781 // Put score on the source vgprs. If this is a store, just use those 782 // specific register(s). 783 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { 784 int AddrOpIdx = 785 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr); 786 // All GDS operations must protect their address register (same as 787 // export.) 788 if (AddrOpIdx != -1) { 789 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); 790 } 791 792 if (Inst.mayStore()) { 793 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) { 794 setExpScore( 795 &Inst, TII, TRI, MRI, 796 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), 797 CurrScore); 798 } 799 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) { 800 setExpScore(&Inst, TII, TRI, MRI, 801 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 802 AMDGPU::OpName::data1), 803 CurrScore); 804 } 805 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && 806 Inst.getOpcode() != AMDGPU::DS_APPEND && 807 Inst.getOpcode() != AMDGPU::DS_CONSUME && 808 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { 809 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 810 const MachineOperand &Op = Inst.getOperand(I); 811 if (Op.isReg() && !Op.isDef() && 812 TRI->isVectorRegister(*MRI, Op.getReg())) { 813 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 814 } 815 } 816 } 817 } else if (TII->isFLAT(Inst)) { 818 if (Inst.mayStore()) { 819 setExpScore( 820 &Inst, TII, TRI, MRI, 821 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 822 CurrScore); 823 } else if (SIInstrInfo::isAtomicRet(Inst)) { 824 setExpScore( 825 &Inst, TII, TRI, MRI, 826 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 827 CurrScore); 828 } 829 } else if (TII->isMIMG(Inst)) { 830 if (Inst.mayStore()) { 831 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 832 } else if (SIInstrInfo::isAtomicRet(Inst)) { 833 setExpScore( 834 &Inst, TII, TRI, MRI, 835 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 836 CurrScore); 837 } 838 } else if (TII->isMTBUF(Inst)) { 839 if (Inst.mayStore()) { 840 setExpScore(&Inst, TII, TRI, 
MRI, 0, CurrScore); 841 } 842 } else if (TII->isMUBUF(Inst)) { 843 if (Inst.mayStore()) { 844 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 845 } else if (SIInstrInfo::isAtomicRet(Inst)) { 846 setExpScore( 847 &Inst, TII, TRI, MRI, 848 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 849 CurrScore); 850 } 851 } else if (TII->isLDSDIR(Inst)) { 852 // LDSDIR instructions attach the score to the destination. 853 setExpScore( 854 &Inst, TII, TRI, MRI, 855 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst), 856 CurrScore); 857 } else { 858 if (TII->isEXP(Inst)) { 859 // For export the destination registers are really temps that 860 // can be used as the actual source after export patching, so 861 // we need to treat them like sources and set the EXP_CNT 862 // score. 863 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 864 MachineOperand &DefMO = Inst.getOperand(I); 865 if (DefMO.isReg() && DefMO.isDef() && 866 TRI->isVGPR(*MRI, DefMO.getReg())) { 867 setRegScore( 868 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)), 869 EXP_CNT, CurrScore); 870 } 871 } 872 } 873 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 874 MachineOperand &MO = Inst.getOperand(I); 875 if (MO.isReg() && !MO.isDef() && 876 TRI->isVectorRegister(*MRI, MO.getReg())) { 877 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 878 } 879 } 880 } 881 #if 0 // TODO: check if this is handled by MUBUF code above. 882 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD || 883 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 || 884 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { 885 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); 886 unsigned OpNo;//TODO: find the OpNo for this operand; 887 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo); 888 for (int RegNo = Interval.first; RegNo < Interval.second; 889 ++RegNo) { 890 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); 891 } 892 #endif 893 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { 894 // Match the score to the destination registers. 895 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 896 auto &Op = Inst.getOperand(I); 897 if (!Op.isReg() || !Op.isDef()) 898 continue; 899 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I); 900 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { 901 if (Interval.first >= NUM_ALL_VGPRS) 902 continue; 903 if (updateVMCntOnly(Inst)) { 904 // updateVMCntOnly should only leave us with VGPRs 905 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR 906 // defs. That's required for a sane index into `VgprMemTypes` below 907 assert(TRI->isVectorRegister(*MRI, Op.getReg())); 908 VmemType V = getVmemType(Inst); 909 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) 910 VgprVmemTypes[RegNo] |= 1 << V; 911 } 912 } 913 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 914 setRegScore(RegNo, T, CurrScore); 915 } 916 } 917 if (Inst.mayStore() && 918 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { 919 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS 920 // written can be accessed. A load from LDS to VMEM does not need a wait. 921 unsigned Slot = 0; 922 for (const auto *MemOp : Inst.memoperands()) { 923 if (!MemOp->isStore() || 924 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) 925 continue; 926 // Comparing just AA info does not guarantee memoperands are equal 927 // in general, but this is so for LDS DMA in practice. 
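      // Illustrative example: two LDS DMA stores with distinct alias scopes
      // get the extra-VGPR slots 1 and 2; a later access that aliases only
      // the second store can then wait on slot 2's score instead of the
      // shared slot 0, which would wait on all LDS DMA stores.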
928 auto AAI = MemOp->getAAInfo(); 929 // Alias scope information gives a way to definitely identify an 930 // original memory object and practically produced in the module LDS 931 // lowering pass. If there is no scope available we will not be able 932 // to disambiguate LDS aliasing as after the module lowering all LDS 933 // is squashed into a single big object. Do not attempt to use one of 934 // the limited LDSDMAStores for something we will not be able to use 935 // anyway. 936 if (!AAI || !AAI.Scope) 937 break; 938 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { 939 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { 940 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { 941 Slot = I + 1; 942 break; 943 } 944 } 945 } 946 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1) 947 break; 948 LDSDMAStores.push_back(&Inst); 949 Slot = LDSDMAStores.size(); 950 break; 951 } 952 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore); 953 if (Slot) 954 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); 955 } 956 } 957 } 958 959 void WaitcntBrackets::print(raw_ostream &OS) { 960 OS << '\n'; 961 for (auto T : inst_counter_types(MaxCounter)) { 962 unsigned SR = getScoreRange(T); 963 964 switch (T) { 965 case LOAD_CNT: 966 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" 967 << SR << "): "; 968 break; 969 case DS_CNT: 970 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" 971 << SR << "): "; 972 break; 973 case EXP_CNT: 974 OS << " EXP_CNT(" << SR << "): "; 975 break; 976 case STORE_CNT: 977 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" 978 << SR << "): "; 979 break; 980 case SAMPLE_CNT: 981 OS << " SAMPLE_CNT(" << SR << "): "; 982 break; 983 case BVH_CNT: 984 OS << " BVH_CNT(" << SR << "): "; 985 break; 986 case KM_CNT: 987 OS << " KM_CNT(" << SR << "): "; 988 break; 989 default: 990 OS << " UNKNOWN(" << SR << "): "; 991 break; 992 } 993 994 if (SR != 0) { 995 // Print vgpr scores. 996 unsigned LB = getScoreLB(T); 997 998 for (int J = 0; J <= VgprUB; J++) { 999 unsigned RegScore = getRegScore(J, T); 1000 if (RegScore <= LB) 1001 continue; 1002 unsigned RelScore = RegScore - LB - 1; 1003 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { 1004 OS << RelScore << ":v" << J << " "; 1005 } else { 1006 OS << RelScore << ":ds "; 1007 } 1008 } 1009 // Also need to print sgpr scores for lgkm_cnt. 1010 if (T == SmemAccessCounter) { 1011 for (int J = 0; J <= SgprUB; J++) { 1012 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); 1013 if (RegScore <= LB) 1014 continue; 1015 unsigned RelScore = RegScore - LB - 1; 1016 OS << RelScore << ":s" << J << " "; 1017 } 1018 } 1019 } 1020 OS << '\n'; 1021 } 1022 OS << '\n'; 1023 } 1024 1025 /// Simplify the waitcnt, in the sense of removing redundant counts, and return 1026 /// whether a waitcnt instruction is needed at all. 1027 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { 1028 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); 1029 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); 1030 simplifyWaitcnt(DS_CNT, Wait.DsCnt); 1031 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); 1032 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); 1033 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); 1034 simplifyWaitcnt(KM_CNT, Wait.KmCnt); 1035 } 1036 1037 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, 1038 unsigned &Count) const { 1039 // The number of outstanding events for this type, T, can be calculated 1040 // as (UB - LB). 
  // If the current Count is greater than or equal to the number of
  // outstanding events, then the wait for this counter is redundant.
  if (Count >= getScoreRange(T))
    Count = ~0u;
}

void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
                                    AMDGPU::Waitcnt &Wait) const {
  unsigned ScoreToWait = getRegScore(RegNo, T);

  // If the score of the source operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out-of-order when there are multiple
      // event types in the bracket, so emit a wait with a conservative value
      // of 0 for this counter.
      addWait(Wait, T, 0);
    } else {
      // If a counter has been maxed out, avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
      addWait(Wait, T, NeededWait);
    }
  }
}

void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always complete out of order.
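  // For example, if both an SMEM load and an LDS access are outstanding on
  // this counter, the hardware may decrement it in either order, so a
  // dependence on either operation requires a conservative wait of 0.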
1103 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) 1104 return true; 1105 return hasMixedPendingEvents(T); 1106 } 1107 1108 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 1109 false) 1110 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 1111 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) 1112 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 1113 false) 1114 1115 char SIInsertWaitcnts::ID = 0; 1116 1117 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; 1118 1119 FunctionPass *llvm::createSIInsertWaitcntsPass() { 1120 return new SIInsertWaitcnts(); 1121 } 1122 1123 static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, 1124 unsigned NewEnc) { 1125 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); 1126 assert(OpIdx >= 0); 1127 1128 MachineOperand &MO = MI.getOperand(OpIdx); 1129 1130 if (NewEnc == MO.getImm()) 1131 return false; 1132 1133 MO.setImm(NewEnc); 1134 return true; 1135 } 1136 1137 /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, 1138 /// and if so, which counter it is waiting on. 1139 static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { 1140 switch (Opcode) { 1141 case AMDGPU::S_WAIT_LOADCNT: 1142 return LOAD_CNT; 1143 case AMDGPU::S_WAIT_EXPCNT: 1144 return EXP_CNT; 1145 case AMDGPU::S_WAIT_STORECNT: 1146 return STORE_CNT; 1147 case AMDGPU::S_WAIT_SAMPLECNT: 1148 return SAMPLE_CNT; 1149 case AMDGPU::S_WAIT_BVHCNT: 1150 return BVH_CNT; 1151 case AMDGPU::S_WAIT_DSCNT: 1152 return DS_CNT; 1153 case AMDGPU::S_WAIT_KMCNT: 1154 return KM_CNT; 1155 default: 1156 return {}; 1157 } 1158 } 1159 1160 bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { 1161 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode()); 1162 if (Opcode == Waitcnt->getOpcode()) 1163 return false; 1164 1165 Waitcnt->setDesc(TII->get(Opcode)); 1166 return true; 1167 } 1168 1169 /// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that 1170 /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits 1171 /// from \p Wait that were added by previous passes. Currently this pass 1172 /// conservatively assumes that these preexisting waits are required for 1173 /// correctness. 1174 bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( 1175 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, 1176 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { 1177 assert(ST); 1178 assert(isNormalMode(MaxCounter)); 1179 1180 bool Modified = false; 1181 MachineInstr *WaitcntInstr = nullptr; 1182 MachineInstr *WaitcntVsCntInstr = nullptr; 1183 1184 for (auto &II : 1185 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { 1186 if (II.isMetaInstruction()) 1187 continue; 1188 1189 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); 1190 bool IsSoft = Opcode != II.getOpcode(); 1191 1192 // Update required wait count. If this is a soft waitcnt (= it was added 1193 // by an earlier pass), it may be entirely removed. 1194 if (Opcode == AMDGPU::S_WAITCNT) { 1195 unsigned IEnc = II.getOperand(0).getImm(); 1196 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); 1197 if (IsSoft) 1198 ScoreBrackets.simplifyWaitcnt(OldWait); 1199 Wait = Wait.combined(OldWait); 1200 1201 // Merge consecutive waitcnt of the same type by erasing multiples. 
1202 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) { 1203 II.eraseFromParent(); 1204 Modified = true; 1205 } else 1206 WaitcntInstr = &II; 1207 } else { 1208 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); 1209 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 1210 1211 unsigned OldVSCnt = 1212 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1213 if (IsSoft) 1214 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); 1215 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); 1216 1217 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) { 1218 II.eraseFromParent(); 1219 Modified = true; 1220 } else 1221 WaitcntVsCntInstr = &II; 1222 } 1223 } 1224 1225 if (WaitcntInstr) { 1226 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, 1227 AMDGPU::encodeWaitcnt(IV, Wait)); 1228 Modified |= promoteSoftWaitCnt(WaitcntInstr); 1229 1230 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); 1231 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt); 1232 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); 1233 Wait.LoadCnt = ~0u; 1234 Wait.ExpCnt = ~0u; 1235 Wait.DsCnt = ~0u; 1236 1237 LLVM_DEBUG(It == WaitcntInstr->getParent()->end() 1238 ? dbgs() 1239 << "applyPreexistingWaitcnt\n" 1240 << "New Instr at block end: " << *WaitcntInstr << '\n' 1241 : dbgs() << "applyPreexistingWaitcnt\n" 1242 << "Old Instr: " << *It 1243 << "New Instr: " << *WaitcntInstr << '\n'); 1244 } 1245 1246 if (WaitcntVsCntInstr) { 1247 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, 1248 AMDGPU::OpName::simm16, Wait.StoreCnt); 1249 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr); 1250 1251 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); 1252 Wait.StoreCnt = ~0u; 1253 1254 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() 1255 ? dbgs() << "applyPreexistingWaitcnt\n" 1256 << "New Instr at block end: " << *WaitcntVsCntInstr 1257 << '\n' 1258 : dbgs() << "applyPreexistingWaitcnt\n" 1259 << "Old Instr: " << *It 1260 << "New Instr: " << *WaitcntVsCntInstr << '\n'); 1261 } 1262 1263 return Modified; 1264 } 1265 1266 /// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any 1267 /// required counters in \p Wait 1268 bool WaitcntGeneratorPreGFX12::createNewWaitcnt( 1269 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, 1270 AMDGPU::Waitcnt Wait) { 1271 assert(ST); 1272 assert(isNormalMode(MaxCounter)); 1273 1274 bool Modified = false; 1275 const DebugLoc &DL = Block.findDebugLoc(It); 1276 1277 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a 1278 // single instruction while VScnt has its own instruction. 
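  // For instance (illustrative), a required wait of vmcnt(0) and lgkmcnt(0)
  // with no expcnt requirement is emitted as one "s_waitcnt vmcnt(0)
  // lgkmcnt(0)", with the unused expcnt field left at its maximum by
  // AMDGPU::encodeWaitcnt(), while any storecnt requirement is emitted as a
  // separate s_waitcnt_vscnt below.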
1279 if (Wait.hasWaitExceptStoreCnt()) { 1280 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); 1281 [[maybe_unused]] auto SWaitInst = 1282 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); 1283 Modified = true; 1284 1285 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1286 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1287 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1288 } 1289 1290 if (Wait.hasWaitStoreCnt()) { 1291 assert(ST->hasVscnt()); 1292 1293 [[maybe_unused]] auto SWaitInst = 1294 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1295 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1296 .addImm(Wait.StoreCnt); 1297 Modified = true; 1298 1299 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1300 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1301 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1302 } 1303 1304 return Modified; 1305 } 1306 1307 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and 1308 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that 1309 /// were added by previous passes. Currently this pass conservatively 1310 /// assumes that these preexisting waits are required for correctness. 1311 bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( 1312 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, 1313 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { 1314 assert(ST); 1315 assert(!isNormalMode(MaxCounter)); 1316 1317 bool Modified = false; 1318 MachineInstr *CombinedLoadDsCntInstr = nullptr; 1319 MachineInstr *CombinedStoreDsCntInstr = nullptr; 1320 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; 1321 1322 for (auto &II : 1323 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { 1324 if (II.isMetaInstruction()) 1325 continue; 1326 1327 MachineInstr **UpdatableInstr; 1328 1329 // Update required wait count. If this is a soft waitcnt (= it was added 1330 // by an earlier pass), it may be entirely removed. 1331 1332 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); 1333 bool IsSoft = Opcode != II.getOpcode(); 1334 1335 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) { 1336 unsigned OldEnc = 1337 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1338 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); 1339 if (IsSoft) 1340 ScoreBrackets.simplifyWaitcnt(OldWait); 1341 Wait = Wait.combined(OldWait); 1342 UpdatableInstr = &CombinedLoadDsCntInstr; 1343 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { 1344 unsigned OldEnc = 1345 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1346 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); 1347 if (IsSoft) 1348 ScoreBrackets.simplifyWaitcnt(OldWait); 1349 Wait = Wait.combined(OldWait); 1350 UpdatableInstr = &CombinedStoreDsCntInstr; 1351 } else { 1352 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); 1353 assert(CT.has_value()); 1354 unsigned OldCnt = 1355 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1356 if (IsSoft) 1357 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); 1358 addWait(Wait, CT.value(), OldCnt); 1359 UpdatableInstr = &WaitInstrs[CT.value()]; 1360 } 1361 1362 // Merge consecutive waitcnt of the same type by erasing multiples. 
1363 if (!*UpdatableInstr) { 1364 *UpdatableInstr = &II; 1365 } else { 1366 II.eraseFromParent(); 1367 Modified = true; 1368 } 1369 } 1370 1371 if (CombinedLoadDsCntInstr) { 1372 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need 1373 // to be waited for. Otherwise, let the instruction be deleted so 1374 // the appropriate single counter wait instruction can be inserted 1375 // instead, when new S_WAIT_*CNT instructions are inserted by 1376 // createNewWaitcnt(). As a side effect, resetting the wait counts will 1377 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by 1378 // the loop below that deals with single counter instructions. 1379 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { 1380 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait); 1381 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr, 1382 AMDGPU::OpName::simm16, NewEnc); 1383 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr); 1384 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); 1385 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); 1386 Wait.LoadCnt = ~0u; 1387 Wait.DsCnt = ~0u; 1388 1389 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 1390 ? dbgs() << "applyPreexistingWaitcnt\n" 1391 << "New Instr at block end: " 1392 << *CombinedLoadDsCntInstr << '\n' 1393 : dbgs() << "applyPreexistingWaitcnt\n" 1394 << "Old Instr: " << *It << "New Instr: " 1395 << *CombinedLoadDsCntInstr << '\n'); 1396 } else { 1397 CombinedLoadDsCntInstr->eraseFromParent(); 1398 Modified = true; 1399 } 1400 } 1401 1402 if (CombinedStoreDsCntInstr) { 1403 // Similarly for S_WAIT_STORECNT_DSCNT. 1404 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) { 1405 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait); 1406 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr, 1407 AMDGPU::OpName::simm16, NewEnc); 1408 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr); 1409 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); 1410 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); 1411 Wait.StoreCnt = ~0u; 1412 Wait.DsCnt = ~0u; 1413 1414 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 1415 ? dbgs() << "applyPreexistingWaitcnt\n" 1416 << "New Instr at block end: " 1417 << *CombinedStoreDsCntInstr << '\n' 1418 : dbgs() << "applyPreexistingWaitcnt\n" 1419 << "Old Instr: " << *It << "New Instr: " 1420 << *CombinedStoreDsCntInstr << '\n'); 1421 } else { 1422 CombinedStoreDsCntInstr->eraseFromParent(); 1423 Modified = true; 1424 } 1425 } 1426 1427 // Look for an opportunity to convert existing S_WAIT_LOADCNT, 1428 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT 1429 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing 1430 // instructions so that createNewWaitcnt() will create new combined 1431 // instructions to replace them. 1432 1433 if (Wait.DsCnt != ~0u) { 1434 // This is a vector of addresses in WaitInstrs pointing to instructions 1435 // that should be removed if they are present. 1436 SmallVector<MachineInstr **, 2> WaitsToErase; 1437 1438 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not 1439 // both) need to be waited for, ensure that there are no existing 1440 // individual wait count instructions for these. 
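    // For example, a preexisting "s_wait_loadcnt 0" followed by
    // "s_wait_dscnt 0" would be erased here and later re-emitted by
    // createNewWaitcnt() as a single combined s_wait_loadcnt_dscnt
    // (illustrative of the transformation described above).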
1441 1442 if (Wait.LoadCnt != ~0u) { 1443 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]); 1444 WaitsToErase.push_back(&WaitInstrs[DS_CNT]); 1445 } else if (Wait.StoreCnt != ~0u) { 1446 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]); 1447 WaitsToErase.push_back(&WaitInstrs[DS_CNT]); 1448 } 1449 1450 for (MachineInstr **WI : WaitsToErase) { 1451 if (!*WI) 1452 continue; 1453 1454 (*WI)->eraseFromParent(); 1455 *WI = nullptr; 1456 Modified = true; 1457 } 1458 } 1459 1460 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { 1461 if (!WaitInstrs[CT]) 1462 continue; 1463 1464 unsigned NewCnt = getWait(Wait, CT); 1465 if (NewCnt != ~0u) { 1466 Modified |= updateOperandIfDifferent(*WaitInstrs[CT], 1467 AMDGPU::OpName::simm16, NewCnt); 1468 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]); 1469 1470 ScoreBrackets.applyWaitcnt(CT, NewCnt); 1471 setNoWait(Wait, CT); 1472 1473 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 1474 ? dbgs() << "applyPreexistingWaitcnt\n" 1475 << "New Instr at block end: " << *WaitInstrs[CT] 1476 << '\n' 1477 : dbgs() << "applyPreexistingWaitcnt\n" 1478 << "Old Instr: " << *It 1479 << "New Instr: " << *WaitInstrs[CT] << '\n'); 1480 } else { 1481 WaitInstrs[CT]->eraseFromParent(); 1482 Modified = true; 1483 } 1484 } 1485 1486 return Modified; 1487 } 1488 1489 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait 1490 bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( 1491 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, 1492 AMDGPU::Waitcnt Wait) { 1493 assert(ST); 1494 assert(!isNormalMode(MaxCounter)); 1495 1496 bool Modified = false; 1497 const DebugLoc &DL = Block.findDebugLoc(It); 1498 1499 // Check for opportunities to use combined wait instructions. 1500 if (Wait.DsCnt != ~0u) { 1501 MachineInstr *SWaitInst = nullptr; 1502 1503 if (Wait.LoadCnt != ~0u) { 1504 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); 1505 1506 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) 1507 .addImm(Enc); 1508 1509 Wait.LoadCnt = ~0u; 1510 Wait.DsCnt = ~0u; 1511 } else if (Wait.StoreCnt != ~0u) { 1512 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); 1513 1514 SWaitInst = 1515 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) 1516 .addImm(Enc); 1517 1518 Wait.StoreCnt = ~0u; 1519 Wait.DsCnt = ~0u; 1520 } 1521 1522 if (SWaitInst) { 1523 Modified = true; 1524 1525 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1526 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1527 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1528 } 1529 } 1530 1531 // Generate an instruction for any remaining counter that needs 1532 // waiting for. 1533 1534 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { 1535 unsigned Count = getWait(Wait, CT); 1536 if (Count == ~0u) 1537 continue; 1538 1539 [[maybe_unused]] auto SWaitInst = 1540 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) 1541 .addImm(Count); 1542 1543 Modified = true; 1544 1545 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1546 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1547 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1548 } 1549 1550 return Modified; 1551 } 1552 1553 static bool readsVCCZ(const MachineInstr &MI) { 1554 unsigned Opc = MI.getOpcode(); 1555 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && 1556 !MI.getOperand(1).isUndef(); 1557 } 1558 1559 /// \returns true if the callee inserts an s_waitcnt 0 on function entry. 
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
  // Currently all conventions wait, but this may not always be the case.
  //
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
  return true;
}

/// \returns true if the callee is expected to wait for any outstanding waits
/// before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
  return true;
}

/// Generate the s_waitcnt instruction to be placed before \p MI.
/// Instructions of a given type return their results in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to each memory access instruction.
/// We keep track of the active "score bracket" to determine
/// whether an access of a memory read requires an s_waitcnt
/// and, if so, what the value of each counter is.
/// The "score bracket" is bounded by the lower-bound and upper-bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, we want to generate an s_waitcnt to
/// flush the vmcnt counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  setForceEmitWaitcnt();

  if (MI.isMetaInstruction())
    return false;

  AMDGPU::Waitcnt Wait;

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //       with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(
        AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
  }
  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
  // stores. In this case it can be useful to send a message to explicitly
  // release all VGPRs before the stores have completed, but it is only safe to
  // do this if:
  // * there are no outstanding scratch stores
  // * we are not in Dynamic VGPR mode
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
      ReleaseVGPRInsts.insert(&MI);
  }
  // Resolve vm waits before gs-done.
1633 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || 1634 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && 1635 ST->hasLegacyGeometry() && 1636 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == 1637 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { 1638 Wait.LoadCnt = 0; 1639 } 1640 #if 0 // TODO: the following blocks of logic when we have fence. 1641 else if (MI.getOpcode() == SC_FENCE) { 1642 const unsigned int group_size = 1643 context->shader_info->GetMaxThreadGroupSize(); 1644 // group_size == 0 means thread group size is unknown at compile time 1645 const bool group_is_multi_wave = 1646 (group_size == 0 || group_size > target_info->GetWaveFrontSize()); 1647 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence(); 1648 1649 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) { 1650 SCRegType src_type = Inst->GetSrcType(i); 1651 switch (src_type) { 1652 case SCMEM_LDS: 1653 if (group_is_multi_wave || 1654 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { 1655 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT, 1656 ScoreBrackets->getScoreUB(DS_CNT)); 1657 // LDS may have to wait for VMcnt after buffer load to LDS 1658 if (target_info->HasBufferLoadToLDS()) { 1659 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT, 1660 ScoreBrackets->getScoreUB(LOAD_CNT)); 1661 } 1662 } 1663 break; 1664 1665 case SCMEM_GDS: 1666 if (group_is_multi_wave || fence_is_global) { 1667 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 1668 ScoreBrackets->getScoreUB(EXP_CNT)); 1669 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT, 1670 ScoreBrackets->getScoreUB(DS_CNT)); 1671 } 1672 break; 1673 1674 case SCMEM_UAV: 1675 case SCMEM_TFBUF: 1676 case SCMEM_RING: 1677 case SCMEM_SCATTER: 1678 if (group_is_multi_wave || fence_is_global) { 1679 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 1680 ScoreBrackets->getScoreUB(EXP_CNT)); 1681 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT, 1682 ScoreBrackets->getScoreUB(LOAD_CNT)); 1683 } 1684 break; 1685 1686 case SCMEM_SCRATCH: 1687 default: 1688 break; 1689 } 1690 } 1691 } 1692 #endif 1693 1694 // Export & GDS instructions do not read the EXEC mask until after the export 1695 // is granted (which can occur well after the instruction is issued). 1696 // The shader program must flush all EXP operations on the export-count 1697 // before overwriting the EXEC mask. 1698 else { 1699 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { 1700 // Export and GDS are tracked individually, either may trigger a waitcnt 1701 // for EXEC. 1702 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) || 1703 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) || 1704 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) || 1705 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) { 1706 Wait.ExpCnt = 0; 1707 } 1708 } 1709 1710 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { 1711 // The function is going to insert a wait on everything in its prolog. 1712 // This still needs to be careful if the call target is a load (e.g. a GOT 1713 // load). We also need to check WAW dependency with saved PC. 
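      // Illustrative example (register choice hypothetical): if the call
      // target in s[30:31] was produced by an s_load_dwordx2 that is still in
      // flight, the loops below keep an lgkmcnt/kmcnt wait on s[30:31] while
      // every other pending count is left for the callee's entry wait to
      // resolve.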
1714       Wait = AMDGPU::Waitcnt();
1715
1716       int CallAddrOpIdx =
1717           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1718
1719       if (MI.getOperand(CallAddrOpIdx).isReg()) {
1720         RegInterval CallAddrOpInterval =
1721             ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
1722
1723         for (int RegNo = CallAddrOpInterval.first;
1724              RegNo < CallAddrOpInterval.second; ++RegNo)
1725           ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1726
1727         int RtnAddrOpIdx =
1728             AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1729         if (RtnAddrOpIdx != -1) {
1730           RegInterval RtnAddrOpInterval =
1731               ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
1732
1733           for (int RegNo = RtnAddrOpInterval.first;
1734                RegNo < RtnAddrOpInterval.second; ++RegNo)
1735             ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1736         }
1737       }
1738     } else {
1739       // FIXME: Should not be relying on memoperands.
1740       // Look at the source operands of every instruction to see if
1741       // any of them results from a previous memory operation that affects
1742       // its current usage. If so, an s_waitcnt instruction needs to be
1743       // emitted.
1744       // If the source operand was defined by a load, add the s_waitcnt
1745       // instruction.
1746       //
1747       // Two cases are handled for destination operands:
1748       // 1) If the destination operand was defined by a load, add the s_waitcnt
1749       // instruction to guarantee the right WAW order.
1750       // 2) If a destination operand was used by a recent export/store
1751       // instruction, add s_waitcnt on exp_cnt to guarantee the WAR order.
1752
1753       for (const MachineMemOperand *Memop : MI.memoperands()) {
1754         const Value *Ptr = Memop->getValue();
1755         if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1756           addWait(Wait, SmemAccessCounter, 0);
1757           if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1758             SLoadAddresses.erase(Ptr);
1759         }
1760         unsigned AS = Memop->getAddrSpace();
1761         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1762           continue;
1763         // No need to wait before load from VMEM to LDS.
1764         if (TII->mayWriteLDSThroughDMA(MI))
1765           continue;
1766
1767         // LOAD_CNT is only relevant to vgpr or LDS.
1768         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1769         bool FoundAliasingStore = false;
1770         // Only objects with alias scope info were added to LDSDMAScopes array.
1771         // In the absence of the scope info we will not be able to disambiguate
1772         // aliasing here. There is no need to try searching for a corresponding
1773         // store slot. This is conservatively correct because in that case we
1774         // will produce a wait using the first (general) LDS DMA wait slot which
1775         // will wait on all of them anyway.
1776         if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1777           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1778           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1779             if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1780               FoundAliasingStore = true;
1781               ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1782             }
1783           }
1784         }
1785         if (!FoundAliasingStore)
1786           ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1787         if (Memop->isStore()) {
1788           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1789         }
1790       }
1791
1792       // Loop over use and def operands.
1793       for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1794         MachineOperand &Op = MI.getOperand(I);
1795         if (!Op.isReg())
1796           continue;
1797
1798         // If the instruction does not read tied source, skip the operand.
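        // (If the tied source is never actually read, a pending write to that
        // register is not a RAW hazard for this instruction; the tied def is a
        // separate operand and is still handled as a normal def below.)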
1799 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) 1800 continue; 1801 1802 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I); 1803 1804 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); 1805 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 1806 if (IsVGPR) { 1807 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the 1808 // previous write and this write are the same type of VMEM 1809 // instruction, in which case they're guaranteed to write their 1810 // results in order anyway. 1811 if (Op.isUse() || !updateVMCntOnly(MI) || 1812 ScoreBrackets.hasOtherPendingVmemTypes(RegNo, 1813 getVmemType(MI))) { 1814 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); 1815 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait); 1816 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait); 1817 ScoreBrackets.clearVgprVmemTypes(RegNo); 1818 } 1819 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { 1820 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); 1821 } 1822 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait); 1823 } else { 1824 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); 1825 } 1826 } 1827 } 1828 } 1829 } 1830 1831 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does 1832 // not, we need to ensure the subtarget is capable of backing off barrier 1833 // instructions in case there are any outstanding memory operations that may 1834 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 1835 if (TII->isBarrierStart(MI.getOpcode()) && 1836 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { 1837 Wait = Wait.combined( 1838 AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt())); 1839 } 1840 1841 // TODO: Remove this work-around, enable the assert for Bug 457939 1842 // after fixing the scheduler. Also, the Shader Compiler code is 1843 // independent of target. 1844 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { 1845 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { 1846 Wait.DsCnt = 0; 1847 } 1848 } 1849 1850 // Verify that the wait is actually needed. 1851 ScoreBrackets.simplifyWaitcnt(Wait); 1852 1853 if (ForceEmitZeroWaitcnts) 1854 Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()); 1855 1856 if (ForceEmitWaitcnt[LOAD_CNT]) 1857 Wait.LoadCnt = 0; 1858 if (ForceEmitWaitcnt[EXP_CNT]) 1859 Wait.ExpCnt = 0; 1860 if (ForceEmitWaitcnt[DS_CNT]) 1861 Wait.DsCnt = 0; 1862 if (ForceEmitWaitcnt[SAMPLE_CNT]) 1863 Wait.SampleCnt = 0; 1864 if (ForceEmitWaitcnt[BVH_CNT]) 1865 Wait.BvhCnt = 0; 1866 if (ForceEmitWaitcnt[KM_CNT]) 1867 Wait.KmCnt = 0; 1868 1869 if (FlushVmCnt) { 1870 if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) 1871 Wait.LoadCnt = 0; 1872 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) 1873 Wait.SampleCnt = 0; 1874 if (ScoreBrackets.hasPendingEvent(BVH_CNT)) 1875 Wait.BvhCnt = 0; 1876 } 1877 1878 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, 1879 OldWaitcntInstr); 1880 } 1881 1882 // Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the 1883 // end of the given block if needed. 
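// This is used for loop preheaders identified by isPreheaderToFlush(): paying
// for the flush once before the loop means iterations that only consume
// values loaded outside the loop do not need to repeat a VMEM wait on every
// trip (see shouldFlushVmCnt for the exact heuristic).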
1884 bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, 1885 WaitcntBrackets &ScoreBrackets, 1886 MachineInstr *OldWaitcntInstr) { 1887 AMDGPU::Waitcnt Wait; 1888 1889 unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT); 1890 unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT); 1891 unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT); 1892 1893 if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0) 1894 return false; 1895 1896 if (LoadCntPending != 0) 1897 Wait.LoadCnt = 0; 1898 if (SampleCntPending != 0) 1899 Wait.SampleCnt = 0; 1900 if (BvhCntPending != 0) 1901 Wait.BvhCnt = 0; 1902 1903 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, 1904 OldWaitcntInstr); 1905 } 1906 1907 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, 1908 MachineBasicBlock::instr_iterator It, 1909 MachineBasicBlock &Block, 1910 WaitcntBrackets &ScoreBrackets, 1911 MachineInstr *OldWaitcntInstr) { 1912 bool Modified = false; 1913 1914 if (OldWaitcntInstr) 1915 // Try to merge the required wait with preexisting waitcnt instructions. 1916 // Also erase redundant waitcnt. 1917 Modified = 1918 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); 1919 1920 // Any counts that could have been applied to any existing waitcnt 1921 // instructions will have been done so, now deal with any remaining. 1922 ScoreBrackets.applyWaitcnt(Wait); 1923 1924 // ExpCnt can be merged into VINTERP. 1925 if (Wait.ExpCnt != ~0u && It != Block.instr_end() && 1926 SIInstrInfo::isVINTERP(*It)) { 1927 MachineOperand *WaitExp = 1928 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp); 1929 if (Wait.ExpCnt < WaitExp->getImm()) { 1930 WaitExp->setImm(Wait.ExpCnt); 1931 Modified = true; 1932 } 1933 Wait.ExpCnt = ~0u; 1934 1935 LLVM_DEBUG(dbgs() << "generateWaitcnt\n" 1936 << "Update Instr: " << *It); 1937 } 1938 1939 if (WCG->createNewWaitcnt(Block, It, Wait)) 1940 Modified = true; 1941 1942 return Modified; 1943 } 1944 1945 // This is a flat memory operation. Check to see if it has memory tokens other 1946 // than LDS. Other address spaces supported by flat memory operations involve 1947 // global memory. 1948 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { 1949 assert(TII->isFLAT(MI)); 1950 1951 // All flat instructions use the VMEM counter. 1952 assert(TII->usesVM_CNT(MI)); 1953 1954 // If there are no memory operands then conservatively assume the flat 1955 // operation may access VMEM. 1956 if (MI.memoperands_empty()) 1957 return true; 1958 1959 // See if any memory operand specifies an address space that involves VMEM. 1960 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces 1961 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION 1962 // (GDS) address space is not supported by flat operations. Therefore, simply 1963 // return true unless only the LDS address space is found. 1964 for (const MachineMemOperand *Memop : MI.memoperands()) { 1965 unsigned AS = Memop->getAddrSpace(); 1966 assert(AS != AMDGPUAS::REGION_ADDRESS); 1967 if (AS != AMDGPUAS::LOCAL_ADDRESS) 1968 return true; 1969 } 1970 1971 return false; 1972 } 1973 1974 // This is a flat memory operation. Check to see if it has memory tokens for 1975 // either LDS or FLAT. 1976 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { 1977 assert(TII->isFLAT(MI)); 1978 1979 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. 
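  // (The checks below successively rule out configurations that cannot reach
  // LDS, and otherwise fall back to the memory operands, conservatively
  // returning true when there are none.)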
1980 if (!TII->usesLGKM_CNT(MI)) 1981 return false; 1982 1983 // If in tgsplit mode then there can be no use of LDS. 1984 if (ST->isTgSplitEnabled()) 1985 return false; 1986 1987 // If there are no memory operands then conservatively assume the flat 1988 // operation may access LDS. 1989 if (MI.memoperands_empty()) 1990 return true; 1991 1992 // See if any memory operand specifies an address space that involves LDS. 1993 for (const MachineMemOperand *Memop : MI.memoperands()) { 1994 unsigned AS = Memop->getAddrSpace(); 1995 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) 1996 return true; 1997 } 1998 1999 return false; 2000 } 2001 2002 // This is a flat memory operation. Check to see if it has memory tokens for 2003 // either scratch or FLAT. 2004 bool SIInsertWaitcnts::mayAccessScratchThroughFlat( 2005 const MachineInstr &MI) const { 2006 assert(TII->isFLAT(MI)); 2007 2008 // SCRATCH instructions always access scratch. 2009 if (TII->isFLATScratch(MI)) 2010 return true; 2011 2012 // GLOBAL instructions never access scratch. 2013 if (TII->isFLATGlobal(MI)) 2014 return false; 2015 2016 // If there are no memory operands then conservatively assume the flat 2017 // operation may access scratch. 2018 if (MI.memoperands_empty()) 2019 return true; 2020 2021 // See if any memory operand specifies an address space that involves scratch. 2022 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { 2023 unsigned AS = Memop->getAddrSpace(); 2024 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; 2025 }); 2026 } 2027 2028 static bool isCacheInvOrWBInst(MachineInstr &Inst) { 2029 auto Opc = Inst.getOpcode(); 2030 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || 2031 Opc == AMDGPU::GLOBAL_WBINV; 2032 } 2033 2034 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, 2035 WaitcntBrackets *ScoreBrackets) { 2036 // Now look at the instruction opcode. If it is a memory access 2037 // instruction, update the upper-bound of the appropriate counter's 2038 // bracket and the destination operand scores. 2039 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. 2040 2041 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { 2042 if (TII->isAlwaysGDS(Inst.getOpcode()) || 2043 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { 2044 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); 2045 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); 2046 } else { 2047 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 2048 } 2049 } else if (TII->isFLAT(Inst)) { 2050 // TODO: Track this properly. 2051 if (isCacheInvOrWBInst(Inst)) 2052 return; 2053 2054 assert(Inst.mayLoadOrStore()); 2055 2056 int FlatASCount = 0; 2057 2058 if (mayAccessVMEMThroughFlat(Inst)) { 2059 ++FlatASCount; 2060 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), 2061 Inst); 2062 } 2063 2064 if (mayAccessLDSThroughFlat(Inst)) { 2065 ++FlatASCount; 2066 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 2067 } 2068 2069 // A Flat memory operation must access at least one address space. 2070 assert(FlatASCount); 2071 2072 // This is a flat memory operation that access both VMEM and LDS, so note it 2073 // - it will require that both the VM and LGKM be flushed to zero if it is 2074 // pending when a VM or LGKM dependency occurs. 
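    // Illustrative example: a flat load whose address cannot be classified at
    // compile time may return its result via either the VMEM or the LDS path,
    // so a later dependence on its destination register has to drain both
    // counters (vmcnt(0)/loadcnt(0) and lgkmcnt(0)/dscnt(0)).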
2075     if (FlatASCount > 1)
2076       ScoreBrackets->setPendingFlat();
2077   } else if (SIInstrInfo::isVMEM(Inst) &&
2078              !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2079     ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2080                                  Inst);
2081
2082     if (ST->vmemWriteNeedsExpWaitcnt() &&
2083         (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2084       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2085     }
2086   } else if (TII->isSMRD(Inst)) {
2087     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2088   } else if (Inst.isCall()) {
2089     if (callWaitsOnFunctionReturn(Inst)) {
2090       // Act as a wait on everything
2091       ScoreBrackets->applyWaitcnt(
2092           AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
2093       ScoreBrackets->setStateOnFunctionEntryOrReturn();
2094     } else {
2095       // May need to wait for anything.
2096       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2097     }
2098   } else if (SIInstrInfo::isLDSDIR(Inst)) {
2099     ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2100   } else if (TII->isVINTERP(Inst)) {
2101     int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2102     ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2103   } else if (SIInstrInfo::isEXP(Inst)) {
2104     unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2105     if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2106       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2107     else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2108       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2109     else
2110       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2111   } else {
2112     switch (Inst.getOpcode()) {
2113     case AMDGPU::S_SENDMSG:
2114     case AMDGPU::S_SENDMSG_RTN_B32:
2115     case AMDGPU::S_SENDMSG_RTN_B64:
2116     case AMDGPU::S_SENDMSGHALT:
2117       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2118       break;
2119     case AMDGPU::S_MEMTIME:
2120     case AMDGPU::S_MEMREALTIME:
2121     case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2122     case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2123     case AMDGPU::S_BARRIER_LEAVE:
2124     case AMDGPU::S_GET_BARRIER_STATE_M0:
2125     case AMDGPU::S_GET_BARRIER_STATE_IMM:
2126       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2127       break;
2128     }
2129   }
2130 }
2131
2132 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2133                                  unsigned OtherScore) {
2134   unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2135   unsigned OtherShifted =
2136       OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2137   Score = std::max(MyShifted, OtherShifted);
2138   return OtherShifted > MyShifted;
2139 }
2140
2141 /// Merge the pending events and associated score brackets of \p Other into
2142 /// this bracket's status.
2143 ///
2144 /// Returns whether the merge resulted in a change that requires tighter waits
2145 /// (i.e. the merged brackets strictly dominate the original brackets).
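///
/// Sketch of the per-counter score merge (see mergeScore above): both sides
/// are rebased onto a common upper bound so that "N outstanding events" means
/// the same thing in either bracket; e.g. with 2 events pending here and 3 in
/// \p Other, the merged bracket keeps max(2, 3) = 3 pending and every register
/// score is shifted by the corresponding amount.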
2146 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { 2147 bool StrictDom = false; 2148 2149 VgprUB = std::max(VgprUB, Other.VgprUB); 2150 SgprUB = std::max(SgprUB, Other.SgprUB); 2151 2152 for (auto T : inst_counter_types(MaxCounter)) { 2153 // Merge event flags for this counter 2154 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; 2155 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; 2156 if (OtherEvents & ~OldEvents) 2157 StrictDom = true; 2158 PendingEvents |= OtherEvents; 2159 2160 // Merge scores for this counter 2161 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; 2162 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; 2163 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending); 2164 if (NewUB < ScoreLBs[T]) 2165 report_fatal_error("waitcnt score overflow"); 2166 2167 MergeInfo M; 2168 M.OldLB = ScoreLBs[T]; 2169 M.OtherLB = Other.ScoreLBs[T]; 2170 M.MyShift = NewUB - ScoreUBs[T]; 2171 M.OtherShift = NewUB - Other.ScoreUBs[T]; 2172 2173 ScoreUBs[T] = NewUB; 2174 2175 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); 2176 2177 for (int J = 0; J <= VgprUB; J++) 2178 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); 2179 2180 if (T == SmemAccessCounter) { 2181 for (int J = 0; J <= SgprUB; J++) 2182 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); 2183 } 2184 } 2185 2186 for (int J = 0; J <= VgprUB; J++) { 2187 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; 2188 StrictDom |= NewVmemTypes != VgprVmemTypes[J]; 2189 VgprVmemTypes[J] = NewVmemTypes; 2190 } 2191 2192 return StrictDom; 2193 } 2194 2195 static bool isWaitInstr(MachineInstr &Inst) { 2196 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode()); 2197 return Opcode == AMDGPU::S_WAITCNT || 2198 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() && 2199 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || 2200 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || 2201 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || 2202 counterTypeForInstr(Opcode).has_value(); 2203 } 2204 2205 // Generate s_waitcnt instructions where needed. 2206 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, 2207 MachineBasicBlock &Block, 2208 WaitcntBrackets &ScoreBrackets) { 2209 bool Modified = false; 2210 2211 LLVM_DEBUG({ 2212 dbgs() << "*** Block" << Block.getNumber() << " ***"; 2213 ScoreBrackets.dump(); 2214 }); 2215 2216 // Track the correctness of vccz through this basic block. There are two 2217 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and 2218 // ST->partialVCCWritesUpdateVCCZ(). 2219 bool VCCZCorrect = true; 2220 if (ST->hasReadVCCZBug()) { 2221 // vccz could be incorrect at a basic block boundary if a predecessor wrote 2222 // to vcc and then issued an smem load. 2223 VCCZCorrect = false; 2224 } else if (!ST->partialVCCWritesUpdateVCCZ()) { 2225 // vccz could be incorrect at a basic block boundary if a predecessor wrote 2226 // to vcc_lo or vcc_hi. 2227 VCCZCorrect = false; 2228 } 2229 2230 // Walk over the instructions. 2231 MachineInstr *OldWaitcntInstr = nullptr; 2232 2233 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), 2234 E = Block.instr_end(); 2235 Iter != E;) { 2236 MachineInstr &Inst = *Iter; 2237 2238 // Track pre-existing waitcnts that were added in earlier iterations or by 2239 // the memory legalizer. 
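    // A run of consecutive wait instructions is anchored at the first one so
    // that applyPreexistingWaitcnt can strengthen, combine, or erase the whole
    // group when the next non-wait instruction is processed, rather than
    // emitting an additional wait in front of it.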
2240 if (isWaitInstr(Inst)) { 2241 if (!OldWaitcntInstr) 2242 OldWaitcntInstr = &Inst; 2243 ++Iter; 2244 continue; 2245 } 2246 2247 bool FlushVmCnt = Block.getFirstTerminator() == Inst && 2248 isPreheaderToFlush(Block, ScoreBrackets); 2249 2250 // Generate an s_waitcnt instruction to be placed before Inst, if needed. 2251 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr, 2252 FlushVmCnt); 2253 OldWaitcntInstr = nullptr; 2254 2255 // Restore vccz if it's not known to be correct already. 2256 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst); 2257 2258 // Don't examine operands unless we need to track vccz correctness. 2259 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { 2260 if (Inst.definesRegister(AMDGPU::VCC_LO) || 2261 Inst.definesRegister(AMDGPU::VCC_HI)) { 2262 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz. 2263 if (!ST->partialVCCWritesUpdateVCCZ()) 2264 VCCZCorrect = false; 2265 } else if (Inst.definesRegister(AMDGPU::VCC)) { 2266 // There is a hardware bug on CI/SI where SMRD instruction may corrupt 2267 // vccz bit, so when we detect that an instruction may read from a 2268 // corrupt vccz bit, we need to: 2269 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD 2270 // operations to complete. 2271 // 2. Restore the correct value of vccz by writing the current value 2272 // of vcc back to vcc. 2273 if (ST->hasReadVCCZBug() && 2274 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { 2275 // Writes to vcc while there's an outstanding smem read may get 2276 // clobbered as soon as any read completes. 2277 VCCZCorrect = false; 2278 } else { 2279 // Writes to vcc will fix any incorrect value in vccz. 2280 VCCZCorrect = true; 2281 } 2282 } 2283 } 2284 2285 if (TII->isSMRD(Inst)) { 2286 for (const MachineMemOperand *Memop : Inst.memoperands()) { 2287 // No need to handle invariant loads when avoiding WAR conflicts, as 2288 // there cannot be a vector store to the same memory location. 2289 if (!Memop->isInvariant()) { 2290 const Value *Ptr = Memop->getValue(); 2291 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent())); 2292 } 2293 } 2294 if (ST->hasReadVCCZBug()) { 2295 // This smem read could complete and clobber vccz at any time. 2296 VCCZCorrect = false; 2297 } 2298 } 2299 2300 updateEventWaitcntAfter(Inst, &ScoreBrackets); 2301 2302 #if 0 // TODO: implement resource type check controlled by options with ub = LB. 2303 // If this instruction generates a S_SETVSKIP because it is an 2304 // indexed resource, and we are on Tahiti, then it will also force 2305 // an S_WAITCNT vmcnt(0) 2306 if (RequireCheckResourceType(Inst, context)) { 2307 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. 2308 ScoreBrackets->setScoreLB(LOAD_CNT, 2309 ScoreBrackets->getScoreUB(LOAD_CNT)); 2310 } 2311 #endif 2312 2313 LLVM_DEBUG({ 2314 Inst.print(dbgs()); 2315 ScoreBrackets.dump(); 2316 }); 2317 2318 // TODO: Remove this work-around after fixing the scheduler and enable the 2319 // assert above. 2320 if (RestoreVCCZ) { 2321 // Restore the vccz bit. Any time a value is written to vcc, the vcc 2322 // bit is updated, so we can restore the bit by reading the value of 2323 // vcc and then writing it back to the register. 2324 BuildMI(Block, Inst, Inst.getDebugLoc(), 2325 TII->get(ST->isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), 2326 TRI->getVCC()) 2327 .addReg(TRI->getVCC()); 2328 VCCZCorrect = true; 2329 Modified = true; 2330 } 2331 2332 ++Iter; 2333 } 2334 2335 if (Block.getFirstTerminator() == Block.end() && 2336 isPreheaderToFlush(Block, ScoreBrackets)) 2337 Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr); 2338 2339 return Modified; 2340 } 2341 2342 // Return true if the given machine basic block is a preheader of a loop in 2343 // which we want to flush the vmcnt counter, and false otherwise. 2344 bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, 2345 WaitcntBrackets &ScoreBrackets) { 2346 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false); 2347 if (!IsInserted) 2348 return Iterator->second; 2349 2350 MachineBasicBlock *Succ = MBB.getSingleSuccessor(); 2351 if (!Succ) 2352 return false; 2353 2354 MachineLoop *Loop = MLI->getLoopFor(Succ); 2355 if (!Loop) 2356 return false; 2357 2358 if (Loop->getLoopPreheader() == &MBB && 2359 shouldFlushVmCnt(Loop, ScoreBrackets)) { 2360 Iterator->second = true; 2361 return true; 2362 } 2363 2364 return false; 2365 } 2366 2367 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { 2368 return SIInstrInfo::isVMEM(MI) || 2369 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)); 2370 } 2371 2372 // Return true if it is better to flush the vmcnt counter in the preheader of 2373 // the given loop. We currently decide to flush in two situations: 2374 // 1. The loop contains vmem store(s), no vmem load and at least one use of a 2375 // vgpr containing a value that is loaded outside of the loop. (Only on 2376 // targets with no vscnt counter). 2377 // 2. The loop contains vmem load(s), but the loaded values are not used in the 2378 // loop, and at least one use of a vgpr containing a value that is loaded 2379 // outside of the loop. 2380 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, 2381 WaitcntBrackets &Brackets) { 2382 bool HasVMemLoad = false; 2383 bool HasVMemStore = false; 2384 bool UsesVgprLoadedOutside = false; 2385 DenseSet<Register> VgprUse; 2386 DenseSet<Register> VgprDef; 2387 2388 for (MachineBasicBlock *MBB : ML->blocks()) { 2389 for (MachineInstr &MI : *MBB) { 2390 if (isVMEMOrFlatVMEM(MI)) { 2391 if (MI.mayLoad()) 2392 HasVMemLoad = true; 2393 if (MI.mayStore()) 2394 HasVMemStore = true; 2395 } 2396 for (unsigned I = 0; I < MI.getNumOperands(); I++) { 2397 MachineOperand &Op = MI.getOperand(I); 2398 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg())) 2399 continue; 2400 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I); 2401 // Vgpr use 2402 if (Op.isUse()) { 2403 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 2404 // If we find a register that is loaded inside the loop, 1. and 2. 2405 // are invalidated and we can exit. 2406 if (VgprDef.contains(RegNo)) 2407 return false; 2408 VgprUse.insert(RegNo); 2409 // If at least one of Op's registers is in the score brackets, the 2410 // value is likely loaded outside of the loop. 
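            // (A register score above the counter's lower bound means a VMEM
            // load writing this register was still outstanding at the
            // preheader, i.e. the value originates outside the loop body.)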
2411 if (Brackets.getRegScore(RegNo, LOAD_CNT) > 2412 Brackets.getScoreLB(LOAD_CNT) || 2413 Brackets.getRegScore(RegNo, SAMPLE_CNT) > 2414 Brackets.getScoreLB(SAMPLE_CNT) || 2415 Brackets.getRegScore(RegNo, BVH_CNT) > 2416 Brackets.getScoreLB(BVH_CNT)) { 2417 UsesVgprLoadedOutside = true; 2418 break; 2419 } 2420 } 2421 } 2422 // VMem load vgpr def 2423 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef()) 2424 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 2425 // If we find a register that is loaded inside the loop, 1. and 2. 2426 // are invalidated and we can exit. 2427 if (VgprUse.contains(RegNo)) 2428 return false; 2429 VgprDef.insert(RegNo); 2430 } 2431 } 2432 } 2433 } 2434 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) 2435 return true; 2436 return HasVMemLoad && UsesVgprLoadedOutside; 2437 } 2438 2439 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { 2440 ST = &MF.getSubtarget<GCNSubtarget>(); 2441 TII = ST->getInstrInfo(); 2442 TRI = &TII->getRegisterInfo(); 2443 MRI = &MF.getRegInfo(); 2444 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2445 MLI = &getAnalysis<MachineLoopInfo>(); 2446 PDT = &getAnalysis<MachinePostDominatorTree>(); 2447 if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) 2448 AA = &AAR->getAAResults(); 2449 2450 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); 2451 2452 if (ST->hasExtendedWaitCounts()) { 2453 MaxCounter = NUM_EXTENDED_INST_CNTS; 2454 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter); 2455 WCG = &WCGGFX12Plus; 2456 } else { 2457 MaxCounter = NUM_NORMAL_INST_CNTS; 2458 WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST); 2459 WCG = &WCGPreGFX12; 2460 } 2461 2462 ForceEmitZeroWaitcnts = ForceEmitZeroFlag; 2463 for (auto T : inst_counter_types()) 2464 ForceEmitWaitcnt[T] = false; 2465 2466 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); 2467 2468 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); 2469 2470 OptNone = MF.getFunction().hasOptNone() || 2471 MF.getTarget().getOptLevel() == CodeGenOptLevel::None; 2472 2473 HardwareLimits Limits = {}; 2474 if (ST->hasExtendedWaitCounts()) { 2475 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); 2476 Limits.DscntMax = AMDGPU::getDscntBitMask(IV); 2477 } else { 2478 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV); 2479 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); 2480 } 2481 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); 2482 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); 2483 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); 2484 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); 2485 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); 2486 2487 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); 2488 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); 2489 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); 2490 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); 2491 2492 RegisterEncoding Encoding = {}; 2493 Encoding.VGPR0 = 2494 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; 2495 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; 2496 Encoding.SGPR0 = 2497 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; 2498 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; 2499 2500 BlockInfos.clear(); 2501 bool Modified = false; 2502 2503 MachineBasicBlock &EntryBB = MF.front(); 2504 MachineBasicBlock::iterator I = EntryBB.begin(); 2505 2506 if (!MFI->isEntryFunction()) { 2507 // Wait for any outstanding memory operations 
that the input registers may 2508 // depend on. We can't track them and it's better to do the wait after the 2509 // costly call sequence. 2510 2511 // TODO: Could insert earlier and schedule more liberally with operations 2512 // that only use caller preserved registers. 2513 for (MachineBasicBlock::iterator E = EntryBB.end(); 2514 I != E && (I->isPHI() || I->isMetaInstruction()); ++I) 2515 ; 2516 2517 if (ST->hasExtendedWaitCounts()) { 2518 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) 2519 .addImm(0); 2520 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { 2521 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT) 2522 continue; 2523 2524 BuildMI(EntryBB, I, DebugLoc(), 2525 TII->get(instrsForExtendedCounterTypes[CT])) 2526 .addImm(0); 2527 } 2528 } else { 2529 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); 2530 } 2531 2532 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( 2533 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst, 2534 SmemAccessCounter); 2535 NonKernelInitialState->setStateOnFunctionEntryOrReturn(); 2536 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); 2537 2538 Modified = true; 2539 } 2540 2541 // Keep iterating over the blocks in reverse post order, inserting and 2542 // updating s_waitcnt where needed, until a fix point is reached. 2543 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF)) 2544 BlockInfos.insert({MBB, BlockInfo()}); 2545 2546 std::unique_ptr<WaitcntBrackets> Brackets; 2547 bool Repeat; 2548 do { 2549 Repeat = false; 2550 2551 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE; 2552 ++BII) { 2553 MachineBasicBlock *MBB = BII->first; 2554 BlockInfo &BI = BII->second; 2555 if (!BI.Dirty) 2556 continue; 2557 2558 if (BI.Incoming) { 2559 if (!Brackets) 2560 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming); 2561 else 2562 *Brackets = *BI.Incoming; 2563 } else { 2564 if (!Brackets) 2565 Brackets = std::make_unique<WaitcntBrackets>( 2566 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst, 2567 SmemAccessCounter); 2568 else 2569 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding, 2570 WaitEventMaskForInst, SmemAccessCounter); 2571 } 2572 2573 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets); 2574 BI.Dirty = false; 2575 2576 if (Brackets->hasPendingEvent()) { 2577 BlockInfo *MoveBracketsToSucc = nullptr; 2578 for (MachineBasicBlock *Succ : MBB->successors()) { 2579 auto SuccBII = BlockInfos.find(Succ); 2580 BlockInfo &SuccBI = SuccBII->second; 2581 if (!SuccBI.Incoming) { 2582 SuccBI.Dirty = true; 2583 if (SuccBII <= BII) 2584 Repeat = true; 2585 if (!MoveBracketsToSucc) { 2586 MoveBracketsToSucc = &SuccBI; 2587 } else { 2588 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets); 2589 } 2590 } else if (SuccBI.Incoming->merge(*Brackets)) { 2591 SuccBI.Dirty = true; 2592 if (SuccBII <= BII) 2593 Repeat = true; 2594 } 2595 } 2596 if (MoveBracketsToSucc) 2597 MoveBracketsToSucc->Incoming = std::move(Brackets); 2598 } 2599 } 2600 } while (Repeat); 2601 2602 if (ST->hasScalarStores()) { 2603 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; 2604 bool HaveScalarStores = false; 2605 2606 for (MachineBasicBlock &MBB : MF) { 2607 for (MachineInstr &MI : MBB) { 2608 if (!HaveScalarStores && TII->isScalarStore(MI)) 2609 HaveScalarStores = true; 2610 2611 if (MI.getOpcode() == AMDGPU::S_ENDPGM || 2612 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) 2613 EndPgmBlocks.push_back(&MBB); 2614 } 2615 } 2616 2617 if 
(HaveScalarStores) { 2618 // If scalar writes are used, the cache must be flushed or else the next 2619 // wave to reuse the same scratch memory can be clobbered. 2620 // 2621 // Insert s_dcache_wb at wave termination points if there were any scalar 2622 // stores, and only if the cache hasn't already been flushed. This could 2623 // be improved by looking across blocks for flushes in postdominating 2624 // blocks from the stores but an explicitly requested flush is probably 2625 // very rare. 2626 for (MachineBasicBlock *MBB : EndPgmBlocks) { 2627 bool SeenDCacheWB = false; 2628 2629 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); 2630 I != E; ++I) { 2631 if (I->getOpcode() == AMDGPU::S_DCACHE_WB) 2632 SeenDCacheWB = true; 2633 else if (TII->isScalarStore(*I)) 2634 SeenDCacheWB = false; 2635 2636 // FIXME: It would be better to insert this before a waitcnt if any. 2637 if ((I->getOpcode() == AMDGPU::S_ENDPGM || 2638 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && 2639 !SeenDCacheWB) { 2640 Modified = true; 2641 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); 2642 } 2643 } 2644 } 2645 } 2646 } 2647 2648 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM 2649 // instructions. 2650 for (MachineInstr *MI : ReleaseVGPRInsts) { 2651 if (ST->requiresNopBeforeDeallocVGPRs()) { 2652 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP)) 2653 .addImm(0); 2654 } 2655 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG)) 2656 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); 2657 Modified = true; 2658 } 2659 ReleaseVGPRInsts.clear(); 2660 2661 return Modified; 2662 } 2663