1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Insert wait instructions for memory reads and writes. 11 /// 12 /// Memory reads and writes are issued asynchronously, so we need to insert 13 /// S_WAITCNT instructions when we want to access any of their results or 14 /// overwrite any register that's used asynchronously. 15 /// 16 /// TODO: This pass currently keeps one timeline per hardware counter. A more 17 /// finely-grained approach that keeps one timeline per event type could 18 /// sometimes get away with generating weaker s_waitcnt instructions. For 19 /// example, when both SMEM and LDS are in flight and we need to wait for 20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, 21 /// but the pass will currently generate a conservative lgkmcnt(0) because 22 /// multiple event types are in flight. 23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "AMDGPU.h" 27 #include "GCNSubtarget.h" 28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 29 #include "SIMachineFunctionInfo.h" 30 #include "Utils/AMDGPUBaseInfo.h" 31 #include "llvm/ADT/MapVector.h" 32 #include "llvm/ADT/PostOrderIterator.h" 33 #include "llvm/ADT/Sequence.h" 34 #include "llvm/Analysis/AliasAnalysis.h" 35 #include "llvm/CodeGen/MachineLoopInfo.h" 36 #include "llvm/CodeGen/MachinePassManager.h" 37 #include "llvm/CodeGen/MachinePostDominators.h" 38 #include "llvm/Support/DebugCounter.h" 39 #include "llvm/TargetParser/TargetParser.h" 40 using namespace llvm; 41 42 #define DEBUG_TYPE "si-insert-waitcnts" 43 44 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp", 45 "Force emit s_waitcnt expcnt(0) instrs"); 46 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm", 47 "Force emit s_waitcnt lgkmcnt(0) instrs"); 48 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm", 49 "Force emit s_waitcnt vmcnt(0) instrs"); 50 51 static cl::opt<bool> 52 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", 53 cl::desc("Force all waitcnt instrs to be emitted as " 54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), 55 cl::init(false), cl::Hidden); 56 57 static cl::opt<bool> ForceEmitZeroLoadFlag( 58 "amdgpu-waitcnt-load-forcezero", 59 cl::desc("Force all waitcnt load counters to wait until 0"), 60 cl::init(false), cl::Hidden); 61 62 namespace { 63 // Class of object that encapsulates latest instruction counter score 64 // associated with the operand. Used for determining whether 65 // s_waitcnt instruction needs to be emitted. 66 67 enum InstCounterType { 68 LOAD_CNT = 0, // VMcnt prior to gfx12. 69 DS_CNT, // LKGMcnt prior to gfx12. 70 EXP_CNT, // 71 STORE_CNT, // VScnt in gfx10/gfx11. 72 NUM_NORMAL_INST_CNTS, 73 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. 74 BVH_CNT, // gfx12+ only. 75 KM_CNT, // gfx12+ only. 76 X_CNT, // gfx1250. 77 NUM_EXTENDED_INST_CNTS, 78 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS 79 }; 80 } // namespace 81 82 namespace llvm { 83 template <> struct enum_iteration_traits<InstCounterType> { 84 static constexpr bool is_iterable = true; 85 }; 86 } // namespace llvm 87 88 namespace { 89 // Return an iterator over all counters between LOAD_CNT (the first counter) 90 // and \c MaxCounter (exclusive, default value yields an enumeration over 91 // all counters). 92 auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { 93 return enum_seq(LOAD_CNT, MaxCounter); 94 } 95 96 using RegInterval = std::pair<int, int>; 97 98 struct HardwareLimits { 99 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. 100 unsigned ExpcntMax; 101 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. 102 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. 103 unsigned SamplecntMax; // gfx12+ only. 104 unsigned BvhcntMax; // gfx12+ only. 105 unsigned KmcntMax; // gfx12+ only. 106 unsigned XcntMax; // gfx1250. 107 }; 108 109 #define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \ 110 DECL(VMEM_ACCESS) /* vmem read & write */ \ 111 DECL(VMEM_READ_ACCESS) /* vmem read */ \ 112 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \ 113 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \ 114 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \ 115 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \ 116 DECL(VMEM_GROUP) /* vmem group */ \ 117 DECL(LDS_ACCESS) /* lds read & write */ \ 118 DECL(GDS_ACCESS) /* gds read & write */ \ 119 DECL(SQ_MESSAGE) /* send message */ \ 120 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \ 121 DECL(SMEM_GROUP) /* scalar-memory group */ \ 122 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \ 123 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \ 124 DECL(EXP_POS_ACCESS) /* write to export position */ \ 125 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \ 126 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \ 127 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ 128 129 // clang-format off 130 #define AMDGPU_EVENT_ENUM(Name) Name, 131 enum WaitEventType { 132 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM) 133 NUM_WAIT_EVENTS 134 }; 135 #undef AMDGPU_EVENT_ENUM 136 137 #define AMDGPU_EVENT_NAME(Name) #Name, 138 static constexpr StringLiteral WaitEventTypeName[] = { 139 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME) 140 }; 141 #undef AMDGPU_EVENT_NAME 142 // clang-format on 143 144 // The mapping is: 145 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs 146 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots 147 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs 148 // We reserve a fixed number of VGPR slots in the scoring tables for 149 // special tokens like SCMEM_LDS (needed for buffer load to LDS). 150 enum RegisterMapping { 151 SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets. 152 AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets. 153 SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. 154 // Artificial register slots to track LDS writes into specific LDS locations 155 // if a location is known. When slots are exhausted or location is 156 // unknown use the first slot. The first slot is also always updated in 157 // addition to known location's slot to properly generate waits if dependent 158 // instruction's location is unknown. 159 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores. 160 NUM_LDS_VGPRS = 9, // One more than the stores we track. 161 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start. 162 }; 163 164 // Enumerate different types of result-returning VMEM operations. Although 165 // s_waitcnt orders them all with a single vmcnt counter, in the absence of 166 // s_waitcnt only instructions of the same VmemType are guaranteed to write 167 // their results in order -- so there is no need to insert an s_waitcnt between 168 // two instructions of the same type that write the same vgpr. 169 enum VmemType { 170 // BUF instructions and MIMG instructions without a sampler. 171 VMEM_NOSAMPLER, 172 // MIMG instructions with a sampler. 173 VMEM_SAMPLER, 174 // BVH instructions 175 VMEM_BVH, 176 NUM_VMEM_TYPES 177 }; 178 179 // Maps values of InstCounterType to the instruction that waits on that 180 // counter. Only used if GCNSubtarget::hasExtendedWaitCounts() 181 // returns true. 182 static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { 183 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, 184 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, 185 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT}; 186 187 static bool updateVMCntOnly(const MachineInstr &Inst) { 188 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) || 189 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst); 190 } 191 192 #ifndef NDEBUG 193 static bool isNormalMode(InstCounterType MaxCounter) { 194 return MaxCounter == NUM_NORMAL_INST_CNTS; 195 } 196 #endif // NDEBUG 197 198 VmemType getVmemType(const MachineInstr &Inst) { 199 assert(updateVMCntOnly(Inst)); 200 if (!SIInstrInfo::isImage(Inst)) 201 return VMEM_NOSAMPLER; 202 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); 203 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = 204 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 205 206 if (BaseInfo->BVH) 207 return VMEM_BVH; 208 209 // We have to make an additional check for isVSAMPLE here since some 210 // instructions don't have a sampler, but are still classified as sampler 211 // instructions for the purposes of e.g. waitcnt. 212 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst)) 213 return VMEM_SAMPLER; 214 215 return VMEM_NOSAMPLER; 216 } 217 218 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { 219 switch (T) { 220 case LOAD_CNT: 221 return Wait.LoadCnt; 222 case EXP_CNT: 223 return Wait.ExpCnt; 224 case DS_CNT: 225 return Wait.DsCnt; 226 case STORE_CNT: 227 return Wait.StoreCnt; 228 case SAMPLE_CNT: 229 return Wait.SampleCnt; 230 case BVH_CNT: 231 return Wait.BvhCnt; 232 case KM_CNT: 233 return Wait.KmCnt; 234 case X_CNT: 235 return Wait.XCnt; 236 default: 237 llvm_unreachable("bad InstCounterType"); 238 } 239 } 240 241 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { 242 unsigned &WC = getCounterRef(Wait, T); 243 WC = std::min(WC, Count); 244 } 245 246 void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { 247 getCounterRef(Wait, T) = ~0u; 248 } 249 250 unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { 251 return getCounterRef(Wait, T); 252 } 253 254 // Mapping from event to counter according to the table masks. 255 InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { 256 for (auto T : inst_counter_types()) { 257 if (masks[T] & (1 << E)) 258 return T; 259 } 260 llvm_unreachable("event type has no associated counter"); 261 } 262 263 // This objects maintains the current score brackets of each wait counter, and 264 // a per-register scoreboard for each wait counter. 265 // 266 // We also maintain the latest score for every event type that can change the 267 // waitcnt in order to know if there are multiple types of events within 268 // the brackets. When multiple types of event happen in the bracket, 269 // wait count may get decreased out of order, therefore we need to put in 270 // "s_waitcnt 0" before use. 271 class WaitcntBrackets { 272 public: 273 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, 274 HardwareLimits Limits, const unsigned *WaitEventMaskForInst, 275 InstCounterType SmemAccessCounter) 276 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), 277 WaitEventMaskForInst(WaitEventMaskForInst), 278 SmemAccessCounter(SmemAccessCounter) {} 279 280 unsigned getWaitCountMax(InstCounterType T) const { 281 switch (T) { 282 case LOAD_CNT: 283 return Limits.LoadcntMax; 284 case DS_CNT: 285 return Limits.DscntMax; 286 case EXP_CNT: 287 return Limits.ExpcntMax; 288 case STORE_CNT: 289 return Limits.StorecntMax; 290 case SAMPLE_CNT: 291 return Limits.SamplecntMax; 292 case BVH_CNT: 293 return Limits.BvhcntMax; 294 case KM_CNT: 295 return Limits.KmcntMax; 296 case X_CNT: 297 return Limits.XcntMax; 298 default: 299 break; 300 } 301 return 0; 302 } 303 304 bool isSmemCounter(InstCounterType T) const { 305 return T == SmemAccessCounter || T == X_CNT; 306 } 307 308 unsigned getSgprScoresIdx(InstCounterType T) const { 309 assert(isSmemCounter(T) && "Invalid SMEM counter"); 310 return T == X_CNT ? 1 : 0; 311 } 312 313 unsigned getScoreLB(InstCounterType T) const { 314 assert(T < NUM_INST_CNTS); 315 return ScoreLBs[T]; 316 } 317 318 unsigned getScoreUB(InstCounterType T) const { 319 assert(T < NUM_INST_CNTS); 320 return ScoreUBs[T]; 321 } 322 323 unsigned getScoreRange(InstCounterType T) const { 324 return getScoreUB(T) - getScoreLB(T); 325 } 326 327 unsigned getRegScore(int GprNo, InstCounterType T) const { 328 if (GprNo < NUM_ALL_VGPRS) 329 return VgprScores[T][GprNo]; 330 return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; 331 } 332 333 bool merge(const WaitcntBrackets &Other); 334 335 RegInterval getRegInterval(const MachineInstr *MI, 336 const MachineRegisterInfo *MRI, 337 const SIRegisterInfo *TRI, 338 const MachineOperand &Op) const; 339 340 bool counterOutOfOrder(InstCounterType T) const; 341 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; 342 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; 343 344 void determineWait(InstCounterType T, RegInterval Interval, 345 AMDGPU::Waitcnt &Wait) const; 346 void determineWait(InstCounterType T, int RegNo, 347 AMDGPU::Waitcnt &Wait) const { 348 determineWait(T, {RegNo, RegNo + 1}, Wait); 349 } 350 351 void applyWaitcnt(const AMDGPU::Waitcnt &Wait); 352 void applyWaitcnt(InstCounterType T, unsigned Count); 353 void applyXcnt(const AMDGPU::Waitcnt &Wait); 354 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, 355 const MachineRegisterInfo *MRI, WaitEventType E, 356 MachineInstr &MI); 357 358 unsigned hasPendingEvent() const { return PendingEvents; } 359 unsigned hasPendingEvent(WaitEventType E) const { 360 return PendingEvents & (1 << E); 361 } 362 unsigned hasPendingEvent(InstCounterType T) const { 363 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; 364 assert((HasPending != 0) == (getScoreRange(T) != 0)); 365 return HasPending; 366 } 367 368 bool hasMixedPendingEvents(InstCounterType T) const { 369 unsigned Events = hasPendingEvent(T); 370 // Return true if more than one bit is set in Events. 371 return Events & (Events - 1); 372 } 373 374 bool hasPendingFlat() const { 375 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && 376 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || 377 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && 378 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); 379 } 380 381 void setPendingFlat() { 382 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; 383 LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; 384 } 385 386 bool hasPendingGDS() const { 387 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; 388 } 389 390 unsigned getPendingGDSWait() const { 391 return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); 392 } 393 394 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } 395 396 // Return true if there might be pending writes to the vgpr-interval by VMEM 397 // instructions with types different from V. 398 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { 399 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 400 assert(RegNo < NUM_ALL_VGPRS); 401 if (VgprVmemTypes[RegNo] & ~(1 << V)) 402 return true; 403 } 404 return false; 405 } 406 407 void clearVgprVmemTypes(RegInterval Interval) { 408 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 409 assert(RegNo < NUM_ALL_VGPRS); 410 VgprVmemTypes[RegNo] = 0; 411 } 412 } 413 414 void setStateOnFunctionEntryOrReturn() { 415 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); 416 PendingEvents |= WaitEventMaskForInst[STORE_CNT]; 417 } 418 419 ArrayRef<const MachineInstr *> getLDSDMAStores() const { 420 return LDSDMAStores; 421 } 422 423 bool hasPointSampleAccel(const MachineInstr &MI) const; 424 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, 425 RegInterval Interval) const; 426 427 void print(raw_ostream &) const; 428 void dump() const { print(dbgs()); } 429 430 private: 431 struct MergeInfo { 432 unsigned OldLB; 433 unsigned OtherLB; 434 unsigned MyShift; 435 unsigned OtherShift; 436 }; 437 static bool mergeScore(const MergeInfo &M, unsigned &Score, 438 unsigned OtherScore); 439 440 void setScoreLB(InstCounterType T, unsigned Val) { 441 assert(T < NUM_INST_CNTS); 442 ScoreLBs[T] = Val; 443 } 444 445 void setScoreUB(InstCounterType T, unsigned Val) { 446 assert(T < NUM_INST_CNTS); 447 ScoreUBs[T] = Val; 448 449 if (T != EXP_CNT) 450 return; 451 452 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) 453 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); 454 } 455 456 void setRegScore(int GprNo, InstCounterType T, unsigned Val) { 457 setScoreByInterval({GprNo, GprNo + 1}, T, Val); 458 } 459 460 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, 461 unsigned Score); 462 463 void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, 464 const MachineRegisterInfo *MRI, 465 const MachineOperand &Op, InstCounterType CntTy, 466 unsigned Val); 467 468 const GCNSubtarget *ST = nullptr; 469 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; 470 HardwareLimits Limits = {}; 471 const unsigned *WaitEventMaskForInst; 472 InstCounterType SmemAccessCounter; 473 unsigned ScoreLBs[NUM_INST_CNTS] = {0}; 474 unsigned ScoreUBs[NUM_INST_CNTS] = {0}; 475 unsigned PendingEvents = 0; 476 // Remember the last flat memory operation. 477 unsigned LastFlat[NUM_INST_CNTS] = {0}; 478 // Remember the last GDS operation. 479 unsigned LastGDS = 0; 480 // wait_cnt scores for every vgpr. 481 // Keep track of the VgprUB and SgprUB to make merge at join efficient. 482 int VgprUB = -1; 483 int SgprUB = -1; 484 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; 485 // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt 486 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. 487 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the 488 // X_CNT score. 489 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; 490 // Bitmask of the VmemTypes of VMEM instructions that might have a pending 491 // write to each vgpr. 492 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; 493 // Store representative LDS DMA operations. The only useful info here is 494 // alias info. One store is kept per unique AAInfo. 495 SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; 496 }; 497 498 // This abstracts the logic for generating and updating S_WAIT* instructions 499 // away from the analysis that determines where they are needed. This was 500 // done because the set of counters and instructions for waiting on them 501 // underwent a major shift with gfx12, sufficiently so that having this 502 // abstraction allows the main analysis logic to be simpler than it would 503 // otherwise have had to become. 504 class WaitcntGenerator { 505 protected: 506 const GCNSubtarget *ST = nullptr; 507 const SIInstrInfo *TII = nullptr; 508 AMDGPU::IsaVersion IV; 509 InstCounterType MaxCounter; 510 bool OptNone; 511 512 public: 513 WaitcntGenerator() = default; 514 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) 515 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), 516 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), 517 OptNone(MF.getFunction().hasOptNone() || 518 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {} 519 520 // Return true if the current function should be compiled with no 521 // optimization. 522 bool isOptNone() const { return OptNone; } 523 524 // Edits an existing sequence of wait count instructions according 525 // to an incoming Waitcnt value, which is itself updated to reflect 526 // any new wait count instructions which may need to be generated by 527 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits 528 // were made. 529 // 530 // This editing will usually be merely updated operands, but it may also 531 // delete instructions if the incoming Wait value indicates they are not 532 // needed. It may also remove existing instructions for which a wait 533 // is needed if it can be determined that it is better to generate new 534 // instructions later, as can happen on gfx12. 535 virtual bool 536 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, 537 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, 538 MachineBasicBlock::instr_iterator It) const = 0; 539 540 // Transform a soft waitcnt into a normal one. 541 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; 542 543 // Generates new wait count instructions according to the value of 544 // Wait, returning true if any new instructions were created. 545 virtual bool createNewWaitcnt(MachineBasicBlock &Block, 546 MachineBasicBlock::instr_iterator It, 547 AMDGPU::Waitcnt Wait) = 0; 548 549 // Returns an array of bit masks which can be used to map values in 550 // WaitEventType to corresponding counter values in InstCounterType. 551 virtual const unsigned *getWaitEventMask() const = 0; 552 553 // Returns a new waitcnt with all counters except VScnt set to 0. If 554 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. 555 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; 556 557 virtual ~WaitcntGenerator() = default; 558 559 // Create a mask value from the initializer list of wait event types. 560 static constexpr unsigned 561 eventMask(std::initializer_list<WaitEventType> Events) { 562 unsigned Mask = 0; 563 for (auto &E : Events) 564 Mask |= 1 << E; 565 566 return Mask; 567 } 568 }; 569 570 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { 571 public: 572 WaitcntGeneratorPreGFX12() = default; 573 WaitcntGeneratorPreGFX12(const MachineFunction &MF) 574 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {} 575 576 bool 577 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, 578 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, 579 MachineBasicBlock::instr_iterator It) const override; 580 581 bool createNewWaitcnt(MachineBasicBlock &Block, 582 MachineBasicBlock::instr_iterator It, 583 AMDGPU::Waitcnt Wait) override; 584 585 const unsigned *getWaitEventMask() const override { 586 assert(ST); 587 588 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { 589 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, 590 VMEM_BVH_READ_ACCESS}), 591 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), 592 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, 593 EXP_POS_ACCESS, EXP_LDS_ACCESS}), 594 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), 595 0, 596 0, 597 0, 598 0}; 599 600 return WaitEventMaskForInstPreGFX12; 601 } 602 603 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; 604 }; 605 606 class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { 607 public: 608 WaitcntGeneratorGFX12Plus() = default; 609 WaitcntGeneratorGFX12Plus(const MachineFunction &MF, 610 InstCounterType MaxCounter) 611 : WaitcntGenerator(MF, MaxCounter) {} 612 613 bool 614 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, 615 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, 616 MachineBasicBlock::instr_iterator It) const override; 617 618 bool createNewWaitcnt(MachineBasicBlock &Block, 619 MachineBasicBlock::instr_iterator It, 620 AMDGPU::Waitcnt Wait) override; 621 622 const unsigned *getWaitEventMask() const override { 623 assert(ST); 624 625 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { 626 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}), 627 eventMask({LDS_ACCESS, GDS_ACCESS}), 628 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, 629 EXP_POS_ACCESS, EXP_LDS_ACCESS}), 630 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), 631 eventMask({VMEM_SAMPLER_READ_ACCESS}), 632 eventMask({VMEM_BVH_READ_ACCESS}), 633 eventMask({SMEM_ACCESS, SQ_MESSAGE}), 634 eventMask({VMEM_GROUP, SMEM_GROUP})}; 635 636 return WaitEventMaskForInstGFX12Plus; 637 } 638 639 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; 640 }; 641 642 class SIInsertWaitcnts { 643 private: 644 const GCNSubtarget *ST = nullptr; 645 const SIInstrInfo *TII = nullptr; 646 const SIRegisterInfo *TRI = nullptr; 647 const MachineRegisterInfo *MRI = nullptr; 648 649 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; 650 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; 651 MachineLoopInfo *MLI; 652 MachinePostDominatorTree *PDT; 653 AliasAnalysis *AA = nullptr; 654 655 struct BlockInfo { 656 std::unique_ptr<WaitcntBrackets> Incoming; 657 bool Dirty = true; 658 }; 659 660 InstCounterType SmemAccessCounter; 661 662 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; 663 664 bool ForceEmitWaitcnt[NUM_INST_CNTS]; 665 666 // In any given run of this pass, WCG will point to one of these two 667 // generator objects, which must have been re-initialised before use 668 // from a value made using a subtarget constructor. 669 WaitcntGeneratorPreGFX12 WCGPreGFX12; 670 WaitcntGeneratorGFX12Plus WCGGFX12Plus; 671 672 WaitcntGenerator *WCG = nullptr; 673 674 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS 675 // message. 676 DenseSet<MachineInstr *> ReleaseVGPRInsts; 677 678 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; 679 680 public: 681 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, 682 AliasAnalysis *AA) 683 : MLI(MLI), PDT(PDT), AA(AA) { 684 (void)ForceExpCounter; 685 (void)ForceLgkmCounter; 686 (void)ForceVMCounter; 687 } 688 689 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); 690 bool isPreheaderToFlush(MachineBasicBlock &MBB, 691 const WaitcntBrackets &ScoreBrackets); 692 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; 693 bool run(MachineFunction &MF); 694 695 bool isForceEmitWaitcnt() const { 696 for (auto T : inst_counter_types()) 697 if (ForceEmitWaitcnt[T]) 698 return true; 699 return false; 700 } 701 702 void setForceEmitWaitcnt() { 703 // For non-debug builds, ForceEmitWaitcnt has been initialized to false; 704 // For debug builds, get the debug counter info and adjust if need be 705 #ifndef NDEBUG 706 if (DebugCounter::isCounterSet(ForceExpCounter) && 707 DebugCounter::shouldExecute(ForceExpCounter)) { 708 ForceEmitWaitcnt[EXP_CNT] = true; 709 } else { 710 ForceEmitWaitcnt[EXP_CNT] = false; 711 } 712 713 if (DebugCounter::isCounterSet(ForceLgkmCounter) && 714 DebugCounter::shouldExecute(ForceLgkmCounter)) { 715 ForceEmitWaitcnt[DS_CNT] = true; 716 ForceEmitWaitcnt[KM_CNT] = true; 717 } else { 718 ForceEmitWaitcnt[DS_CNT] = false; 719 ForceEmitWaitcnt[KM_CNT] = false; 720 } 721 722 if (DebugCounter::isCounterSet(ForceVMCounter) && 723 DebugCounter::shouldExecute(ForceVMCounter)) { 724 ForceEmitWaitcnt[LOAD_CNT] = true; 725 ForceEmitWaitcnt[SAMPLE_CNT] = true; 726 ForceEmitWaitcnt[BVH_CNT] = true; 727 } else { 728 ForceEmitWaitcnt[LOAD_CNT] = false; 729 ForceEmitWaitcnt[SAMPLE_CNT] = false; 730 ForceEmitWaitcnt[BVH_CNT] = false; 731 } 732 #endif // NDEBUG 733 } 734 735 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM 736 // instruction. 737 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { 738 switch (Inst.getOpcode()) { 739 case AMDGPU::GLOBAL_INV: 740 return VMEM_READ_ACCESS; // tracked using loadcnt 741 case AMDGPU::GLOBAL_WB: 742 case AMDGPU::GLOBAL_WBINV: 743 return VMEM_WRITE_ACCESS; // tracked using storecnt 744 default: 745 break; 746 } 747 748 // Maps VMEM access types to their corresponding WaitEventType. 749 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { 750 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; 751 752 assert(SIInstrInfo::isVMEM(Inst)); 753 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side 754 // these should use VM_CNT. 755 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) 756 return VMEM_ACCESS; 757 if (Inst.mayStore() && 758 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { 759 // FLAT and SCRATCH instructions may access scratch. Other VMEM 760 // instructions do not. 761 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) 762 return SCRATCH_WRITE_ACCESS; 763 return VMEM_WRITE_ACCESS; 764 } 765 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) 766 return VMEM_READ_ACCESS; 767 return VmemReadMapping[getVmemType(Inst)]; 768 } 769 770 bool hasXcnt() const { return ST->hasWaitXCnt(); } 771 772 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; 773 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; 774 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; 775 bool isVmemAccess(const MachineInstr &MI) const; 776 bool generateWaitcntInstBefore(MachineInstr &MI, 777 WaitcntBrackets &ScoreBrackets, 778 MachineInstr *OldWaitcntInstr, 779 bool FlushVmCnt); 780 bool generateWaitcnt(AMDGPU::Waitcnt Wait, 781 MachineBasicBlock::instr_iterator It, 782 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, 783 MachineInstr *OldWaitcntInstr); 784 void updateEventWaitcntAfter(MachineInstr &Inst, 785 WaitcntBrackets *ScoreBrackets); 786 bool isNextENDPGM(MachineBasicBlock::instr_iterator It, 787 MachineBasicBlock *Block) const; 788 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block, 789 WaitcntBrackets &ScoreBrackets); 790 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, 791 WaitcntBrackets &ScoreBrackets); 792 }; 793 794 class SIInsertWaitcntsLegacy : public MachineFunctionPass { 795 public: 796 static char ID; 797 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {} 798 799 bool runOnMachineFunction(MachineFunction &MF) override; 800 801 StringRef getPassName() const override { 802 return "SI insert wait instructions"; 803 } 804 805 void getAnalysisUsage(AnalysisUsage &AU) const override { 806 AU.setPreservesCFG(); 807 AU.addRequired<MachineLoopInfoWrapperPass>(); 808 AU.addRequired<MachinePostDominatorTreeWrapperPass>(); 809 AU.addUsedIfAvailable<AAResultsWrapperPass>(); 810 AU.addPreserved<AAResultsWrapperPass>(); 811 MachineFunctionPass::getAnalysisUsage(AU); 812 } 813 }; 814 815 } // end anonymous namespace 816 817 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, 818 const MachineRegisterInfo *MRI, 819 const SIRegisterInfo *TRI, 820 const MachineOperand &Op) const { 821 if (!TRI->isInAllocatableClass(Op.getReg())) 822 return {-1, -1}; 823 824 // A use via a PW operand does not need a waitcnt. 825 // A partial write is not a WAW. 826 assert(!Op.getSubReg() || !Op.isUndef()); 827 828 RegInterval Result; 829 830 MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); 831 unsigned RegIdx = TRI->getHWRegIndex(MCReg); 832 assert(isUInt<8>(RegIdx)); 833 834 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); 835 unsigned Size = TRI->getRegSizeInBits(*RC); 836 837 // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits 838 if (TRI->isVectorRegister(*MRI, Op.getReg())) { 839 unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); 840 assert(Reg < AGPR_OFFSET); 841 Result.first = Reg; 842 if (TRI->isAGPR(*MRI, Op.getReg())) 843 Result.first += AGPR_OFFSET; 844 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); 845 assert(Size % 16 == 0); 846 Result.second = Result.first + (Size / 16); 847 } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { 848 // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar 849 // sources like SRC_PRIVATE_BASE. 850 Result.first = RegIdx + NUM_ALL_VGPRS; 851 Result.second = Result.first + divideCeil(Size, 32); 852 } else { 853 return {-1, -1}; 854 } 855 856 return Result; 857 } 858 859 void WaitcntBrackets::setScoreByInterval(RegInterval Interval, 860 InstCounterType CntTy, 861 unsigned Score) { 862 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 863 if (RegNo < NUM_ALL_VGPRS) { 864 VgprUB = std::max(VgprUB, RegNo); 865 VgprScores[CntTy][RegNo] = Score; 866 } else { 867 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); 868 SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; 869 } 870 } 871 } 872 873 void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, 874 const SIRegisterInfo *TRI, 875 const MachineRegisterInfo *MRI, 876 const MachineOperand &Op, 877 InstCounterType CntTy, unsigned Score) { 878 RegInterval Interval = getRegInterval(MI, MRI, TRI, Op); 879 setScoreByInterval(Interval, CntTy, Score); 880 } 881 882 // Return true if the subtarget is one that enables Point Sample Acceleration 883 // and the MachineInstr passed in is one to which it might be applied (the 884 // hardware makes this decision based on several factors, but we can't determine 885 // this at compile time, so we have to assume it might be applied if the 886 // instruction supports it). 887 bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { 888 if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) 889 return false; 890 891 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); 892 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = 893 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 894 return BaseInfo->PointSampleAccel; 895 } 896 897 // Return true if the subtarget enables Point Sample Acceleration, the supplied 898 // MachineInstr is one to which it might be applied and the supplied interval is 899 // one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER 900 // (this is the type that a point sample accelerated instruction effectively 901 // becomes) 902 bool WaitcntBrackets::hasPointSamplePendingVmemTypes( 903 const MachineInstr &MI, RegInterval Interval) const { 904 if (!hasPointSampleAccel(MI)) 905 return false; 906 907 return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER); 908 } 909 910 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, 911 const SIRegisterInfo *TRI, 912 const MachineRegisterInfo *MRI, 913 WaitEventType E, MachineInstr &Inst) { 914 InstCounterType T = eventCounter(WaitEventMaskForInst, E); 915 916 unsigned UB = getScoreUB(T); 917 unsigned CurrScore = UB + 1; 918 if (CurrScore == 0) 919 report_fatal_error("InsertWaitcnt score wraparound"); 920 // PendingEvents and ScoreUB need to be update regardless if this event 921 // changes the score of a register or not. 922 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. 923 PendingEvents |= 1 << E; 924 setScoreUB(T, CurrScore); 925 926 if (T == EXP_CNT) { 927 // Put score on the source vgprs. If this is a store, just use those 928 // specific register(s). 929 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) { 930 // All GDS operations must protect their address register (same as 931 // export.) 932 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr)) 933 setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore); 934 935 if (Inst.mayStore()) { 936 if (const auto *Data0 = 937 TII->getNamedOperand(Inst, AMDGPU::OpName::data0)) 938 setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore); 939 if (const auto *Data1 = 940 TII->getNamedOperand(Inst, AMDGPU::OpName::data1)) 941 setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore); 942 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && 943 Inst.getOpcode() != AMDGPU::DS_APPEND && 944 Inst.getOpcode() != AMDGPU::DS_CONSUME && 945 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { 946 for (const MachineOperand &Op : Inst.all_uses()) { 947 if (TRI->isVectorRegister(*MRI, Op.getReg())) 948 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore); 949 } 950 } 951 } else if (TII->isFLAT(Inst)) { 952 if (Inst.mayStore()) { 953 setScoreByOperand(&Inst, TRI, MRI, 954 *TII->getNamedOperand(Inst, AMDGPU::OpName::data), 955 EXP_CNT, CurrScore); 956 } else if (SIInstrInfo::isAtomicRet(Inst)) { 957 setScoreByOperand(&Inst, TRI, MRI, 958 *TII->getNamedOperand(Inst, AMDGPU::OpName::data), 959 EXP_CNT, CurrScore); 960 } 961 } else if (TII->isMIMG(Inst)) { 962 if (Inst.mayStore()) { 963 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, 964 CurrScore); 965 } else if (SIInstrInfo::isAtomicRet(Inst)) { 966 setScoreByOperand(&Inst, TRI, MRI, 967 *TII->getNamedOperand(Inst, AMDGPU::OpName::data), 968 EXP_CNT, CurrScore); 969 } 970 } else if (TII->isMTBUF(Inst)) { 971 if (Inst.mayStore()) 972 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, 973 CurrScore); 974 } else if (TII->isMUBUF(Inst)) { 975 if (Inst.mayStore()) { 976 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, 977 CurrScore); 978 } else if (SIInstrInfo::isAtomicRet(Inst)) { 979 setScoreByOperand(&Inst, TRI, MRI, 980 *TII->getNamedOperand(Inst, AMDGPU::OpName::data), 981 EXP_CNT, CurrScore); 982 } 983 } else if (TII->isLDSDIR(Inst)) { 984 // LDSDIR instructions attach the score to the destination. 985 setScoreByOperand(&Inst, TRI, MRI, 986 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), 987 EXP_CNT, CurrScore); 988 } else { 989 if (TII->isEXP(Inst)) { 990 // For export the destination registers are really temps that 991 // can be used as the actual source after export patching, so 992 // we need to treat them like sources and set the EXP_CNT 993 // score. 994 for (MachineOperand &DefMO : Inst.all_defs()) { 995 if (TRI->isVGPR(*MRI, DefMO.getReg())) { 996 setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore); 997 } 998 } 999 } 1000 for (const MachineOperand &Op : Inst.all_uses()) { 1001 if (TRI->isVectorRegister(*MRI, Op.getReg())) 1002 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore); 1003 } 1004 } 1005 } else if (T == X_CNT) { 1006 for (const MachineOperand &Op : Inst.all_uses()) 1007 setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore); 1008 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { 1009 // Match the score to the destination registers. 1010 // 1011 // Check only explicit operands. Stores, especially spill stores, include 1012 // implicit uses and defs of their super registers which would create an 1013 // artificial dependency, while these are there only for register liveness 1014 // accounting purposes. 1015 // 1016 // Special cases where implicit register defs exists, such as M0 or VCC, 1017 // but none with memory instructions. 1018 for (const MachineOperand &Op : Inst.defs()) { 1019 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op); 1020 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { 1021 if (Interval.first >= NUM_ALL_VGPRS) 1022 continue; 1023 if (updateVMCntOnly(Inst)) { 1024 // updateVMCntOnly should only leave us with VGPRs 1025 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR 1026 // defs. That's required for a sane index into `VgprMemTypes` below 1027 assert(TRI->isVectorRegister(*MRI, Op.getReg())); 1028 VmemType V = getVmemType(Inst); 1029 unsigned char TypesMask = 1 << V; 1030 // If instruction can have Point Sample Accel applied, we have to flag 1031 // this with another potential dependency 1032 if (hasPointSampleAccel(Inst)) 1033 TypesMask |= 1 << VMEM_NOSAMPLER; 1034 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) 1035 VgprVmemTypes[RegNo] |= TypesMask; 1036 } 1037 } 1038 setScoreByInterval(Interval, T, CurrScore); 1039 } 1040 if (Inst.mayStore() && 1041 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { 1042 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS 1043 // written can be accessed. A load from LDS to VMEM does not need a wait. 1044 unsigned Slot = 0; 1045 for (const auto *MemOp : Inst.memoperands()) { 1046 if (!MemOp->isStore() || 1047 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) 1048 continue; 1049 // Comparing just AA info does not guarantee memoperands are equal 1050 // in general, but this is so for LDS DMA in practice. 1051 auto AAI = MemOp->getAAInfo(); 1052 // Alias scope information gives a way to definitely identify an 1053 // original memory object and practically produced in the module LDS 1054 // lowering pass. If there is no scope available we will not be able 1055 // to disambiguate LDS aliasing as after the module lowering all LDS 1056 // is squashed into a single big object. Do not attempt to use one of 1057 // the limited LDSDMAStores for something we will not be able to use 1058 // anyway. 1059 if (!AAI || !AAI.Scope) 1060 break; 1061 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { 1062 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { 1063 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { 1064 Slot = I + 1; 1065 break; 1066 } 1067 } 1068 } 1069 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1) 1070 break; 1071 LDSDMAStores.push_back(&Inst); 1072 Slot = LDSDMAStores.size(); 1073 break; 1074 } 1075 setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); 1076 if (Slot) 1077 setRegScore(FIRST_LDS_VGPR, T, CurrScore); 1078 } 1079 } 1080 } 1081 1082 void WaitcntBrackets::print(raw_ostream &OS) const { 1083 OS << '\n'; 1084 for (auto T : inst_counter_types(MaxCounter)) { 1085 unsigned SR = getScoreRange(T); 1086 1087 switch (T) { 1088 case LOAD_CNT: 1089 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" 1090 << SR << "): "; 1091 break; 1092 case DS_CNT: 1093 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" 1094 << SR << "): "; 1095 break; 1096 case EXP_CNT: 1097 OS << " EXP_CNT(" << SR << "): "; 1098 break; 1099 case STORE_CNT: 1100 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" 1101 << SR << "): "; 1102 break; 1103 case SAMPLE_CNT: 1104 OS << " SAMPLE_CNT(" << SR << "): "; 1105 break; 1106 case BVH_CNT: 1107 OS << " BVH_CNT(" << SR << "): "; 1108 break; 1109 case KM_CNT: 1110 OS << " KM_CNT(" << SR << "): "; 1111 break; 1112 case X_CNT: 1113 OS << " X_CNT(" << SR << "): "; 1114 break; 1115 default: 1116 OS << " UNKNOWN(" << SR << "): "; 1117 break; 1118 } 1119 1120 if (SR != 0) { 1121 // Print vgpr scores. 1122 unsigned LB = getScoreLB(T); 1123 1124 for (int J = 0; J <= VgprUB; J++) { 1125 unsigned RegScore = getRegScore(J, T); 1126 if (RegScore <= LB) 1127 continue; 1128 unsigned RelScore = RegScore - LB - 1; 1129 if (J < FIRST_LDS_VGPR) { 1130 OS << RelScore << ":v" << J << " "; 1131 } else { 1132 OS << RelScore << ":ds "; 1133 } 1134 } 1135 // Also need to print sgpr scores for lgkm_cnt or xcnt. 1136 if (isSmemCounter(T)) { 1137 for (int J = 0; J <= SgprUB; J++) { 1138 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); 1139 if (RegScore <= LB) 1140 continue; 1141 unsigned RelScore = RegScore - LB - 1; 1142 OS << RelScore << ":s" << J << " "; 1143 } 1144 } 1145 } 1146 OS << '\n'; 1147 } 1148 1149 OS << "Pending Events: "; 1150 if (hasPendingEvent()) { 1151 ListSeparator LS; 1152 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) { 1153 if (hasPendingEvent((WaitEventType)I)) { 1154 OS << LS << WaitEventTypeName[I]; 1155 } 1156 } 1157 } else { 1158 OS << "none"; 1159 } 1160 OS << '\n'; 1161 1162 OS << '\n'; 1163 } 1164 1165 /// Simplify the waitcnt, in the sense of removing redundant counts, and return 1166 /// whether a waitcnt instruction is needed at all. 1167 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { 1168 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); 1169 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); 1170 simplifyWaitcnt(DS_CNT, Wait.DsCnt); 1171 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); 1172 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); 1173 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); 1174 simplifyWaitcnt(KM_CNT, Wait.KmCnt); 1175 simplifyWaitcnt(X_CNT, Wait.XCnt); 1176 } 1177 1178 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, 1179 unsigned &Count) const { 1180 // The number of outstanding events for this type, T, can be calculated 1181 // as (UB - LB). If the current Count is greater than or equal to the number 1182 // of outstanding events, then the wait for this counter is redundant. 1183 if (Count >= getScoreRange(T)) 1184 Count = ~0u; 1185 } 1186 1187 void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, 1188 AMDGPU::Waitcnt &Wait) const { 1189 const unsigned LB = getScoreLB(T); 1190 const unsigned UB = getScoreUB(T); 1191 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 1192 unsigned ScoreToWait = getRegScore(RegNo, T); 1193 1194 // If the score of src_operand falls within the bracket, we need an 1195 // s_waitcnt instruction. 1196 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { 1197 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && 1198 !ST->hasFlatLgkmVMemCountInOrder()) { 1199 // If there is a pending FLAT operation, and this is a VMem or LGKM 1200 // waitcnt and the target can report early completion, then we need 1201 // to force a waitcnt 0. 1202 addWait(Wait, T, 0); 1203 } else if (counterOutOfOrder(T)) { 1204 // Counter can get decremented out-of-order when there 1205 // are multiple types event in the bracket. Also emit an s_wait counter 1206 // with a conservative value of 0 for the counter. 1207 addWait(Wait, T, 0); 1208 } else { 1209 // If a counter has been maxed out avoid overflow by waiting for 1210 // MAX(CounterType) - 1 instead. 1211 unsigned NeededWait = 1212 std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); 1213 addWait(Wait, T, NeededWait); 1214 } 1215 } 1216 } 1217 } 1218 1219 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { 1220 applyWaitcnt(LOAD_CNT, Wait.LoadCnt); 1221 applyWaitcnt(EXP_CNT, Wait.ExpCnt); 1222 applyWaitcnt(DS_CNT, Wait.DsCnt); 1223 applyWaitcnt(STORE_CNT, Wait.StoreCnt); 1224 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); 1225 applyWaitcnt(BVH_CNT, Wait.BvhCnt); 1226 applyWaitcnt(KM_CNT, Wait.KmCnt); 1227 applyXcnt(Wait); 1228 } 1229 1230 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { 1231 const unsigned UB = getScoreUB(T); 1232 if (Count >= UB) 1233 return; 1234 if (Count != 0) { 1235 if (counterOutOfOrder(T)) 1236 return; 1237 setScoreLB(T, std::max(getScoreLB(T), UB - Count)); 1238 } else { 1239 setScoreLB(T, UB); 1240 PendingEvents &= ~WaitEventMaskForInst[T]; 1241 } 1242 } 1243 1244 void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { 1245 // Wait on XCNT is redundant if we are already waiting for a load to complete. 1246 // SMEM can return out of order, so only omit XCNT wait if we are waiting till 1247 // zero. 1248 if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) 1249 return applyWaitcnt(X_CNT, 0); 1250 1251 // If we have pending store we cannot optimize XCnt because we do not wait for 1252 // stores. VMEM loads retun in order, so if we only have loads XCnt is 1253 // decremented to the same number as LOADCnt. 1254 if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && 1255 !hasPendingEvent(STORE_CNT)) 1256 return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); 1257 1258 applyWaitcnt(X_CNT, Wait.XCnt); 1259 } 1260 1261 // Where there are multiple types of event in the bracket of a counter, 1262 // the decrement may go out of order. 1263 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { 1264 // Scalar memory read always can go out of order. 1265 if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || 1266 (T == X_CNT && hasPendingEvent(SMEM_GROUP))) 1267 return true; 1268 return hasMixedPendingEvents(T); 1269 } 1270 1271 INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", 1272 false, false) 1273 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) 1274 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) 1275 INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", 1276 false, false) 1277 1278 char SIInsertWaitcntsLegacy::ID = 0; 1279 1280 char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID; 1281 1282 FunctionPass *llvm::createSIInsertWaitcntsPass() { 1283 return new SIInsertWaitcntsLegacy(); 1284 } 1285 1286 static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, 1287 unsigned NewEnc) { 1288 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); 1289 assert(OpIdx >= 0); 1290 1291 MachineOperand &MO = MI.getOperand(OpIdx); 1292 1293 if (NewEnc == MO.getImm()) 1294 return false; 1295 1296 MO.setImm(NewEnc); 1297 return true; 1298 } 1299 1300 /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, 1301 /// and if so, which counter it is waiting on. 1302 static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { 1303 switch (Opcode) { 1304 case AMDGPU::S_WAIT_LOADCNT: 1305 return LOAD_CNT; 1306 case AMDGPU::S_WAIT_EXPCNT: 1307 return EXP_CNT; 1308 case AMDGPU::S_WAIT_STORECNT: 1309 return STORE_CNT; 1310 case AMDGPU::S_WAIT_SAMPLECNT: 1311 return SAMPLE_CNT; 1312 case AMDGPU::S_WAIT_BVHCNT: 1313 return BVH_CNT; 1314 case AMDGPU::S_WAIT_DSCNT: 1315 return DS_CNT; 1316 case AMDGPU::S_WAIT_KMCNT: 1317 return KM_CNT; 1318 case AMDGPU::S_WAIT_XCNT: 1319 return X_CNT; 1320 default: 1321 return {}; 1322 } 1323 } 1324 1325 bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { 1326 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode()); 1327 if (Opcode == Waitcnt->getOpcode()) 1328 return false; 1329 1330 Waitcnt->setDesc(TII->get(Opcode)); 1331 return true; 1332 } 1333 1334 /// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that 1335 /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits 1336 /// from \p Wait that were added by previous passes. Currently this pass 1337 /// conservatively assumes that these preexisting waits are required for 1338 /// correctness. 1339 bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( 1340 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, 1341 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { 1342 assert(ST); 1343 assert(isNormalMode(MaxCounter)); 1344 1345 bool Modified = false; 1346 MachineInstr *WaitcntInstr = nullptr; 1347 MachineInstr *WaitcntVsCntInstr = nullptr; 1348 1349 LLVM_DEBUG({ 1350 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: "; 1351 if (It == OldWaitcntInstr.getParent()->instr_end()) 1352 dbgs() << "end of block\n"; 1353 else 1354 dbgs() << *It; 1355 }); 1356 1357 for (auto &II : 1358 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { 1359 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II); 1360 if (II.isMetaInstruction()) { 1361 LLVM_DEBUG(dbgs() << "skipped meta instruction\n"); 1362 continue; 1363 } 1364 1365 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); 1366 bool TrySimplify = Opcode != II.getOpcode() && !OptNone; 1367 1368 // Update required wait count. If this is a soft waitcnt (= it was added 1369 // by an earlier pass), it may be entirely removed. 1370 if (Opcode == AMDGPU::S_WAITCNT) { 1371 unsigned IEnc = II.getOperand(0).getImm(); 1372 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); 1373 if (TrySimplify) 1374 ScoreBrackets.simplifyWaitcnt(OldWait); 1375 Wait = Wait.combined(OldWait); 1376 1377 // Merge consecutive waitcnt of the same type by erasing multiples. 1378 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) { 1379 II.eraseFromParent(); 1380 Modified = true; 1381 } else 1382 WaitcntInstr = &II; 1383 } else { 1384 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); 1385 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 1386 1387 unsigned OldVSCnt = 1388 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1389 if (TrySimplify) 1390 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); 1391 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); 1392 1393 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) { 1394 II.eraseFromParent(); 1395 Modified = true; 1396 } else 1397 WaitcntVsCntInstr = &II; 1398 } 1399 } 1400 1401 if (WaitcntInstr) { 1402 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, 1403 AMDGPU::encodeWaitcnt(IV, Wait)); 1404 Modified |= promoteSoftWaitCnt(WaitcntInstr); 1405 1406 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); 1407 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt); 1408 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); 1409 Wait.LoadCnt = ~0u; 1410 Wait.ExpCnt = ~0u; 1411 Wait.DsCnt = ~0u; 1412 1413 LLVM_DEBUG(It == WaitcntInstr->getParent()->end() 1414 ? dbgs() 1415 << "applied pre-existing waitcnt\n" 1416 << "New Instr at block end: " << *WaitcntInstr << '\n' 1417 : dbgs() << "applied pre-existing waitcnt\n" 1418 << "Old Instr: " << *It 1419 << "New Instr: " << *WaitcntInstr << '\n'); 1420 } 1421 1422 if (WaitcntVsCntInstr) { 1423 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, 1424 AMDGPU::OpName::simm16, Wait.StoreCnt); 1425 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr); 1426 1427 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); 1428 Wait.StoreCnt = ~0u; 1429 1430 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() 1431 ? dbgs() << "applied pre-existing waitcnt\n" 1432 << "New Instr at block end: " << *WaitcntVsCntInstr 1433 << '\n' 1434 : dbgs() << "applied pre-existing waitcnt\n" 1435 << "Old Instr: " << *It 1436 << "New Instr: " << *WaitcntVsCntInstr << '\n'); 1437 } 1438 1439 return Modified; 1440 } 1441 1442 /// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any 1443 /// required counters in \p Wait 1444 bool WaitcntGeneratorPreGFX12::createNewWaitcnt( 1445 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, 1446 AMDGPU::Waitcnt Wait) { 1447 assert(ST); 1448 assert(isNormalMode(MaxCounter)); 1449 1450 bool Modified = false; 1451 const DebugLoc &DL = Block.findDebugLoc(It); 1452 1453 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a 1454 // single instruction while VScnt has its own instruction. 1455 if (Wait.hasWaitExceptStoreCnt()) { 1456 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); 1457 [[maybe_unused]] auto SWaitInst = 1458 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); 1459 Modified = true; 1460 1461 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1462 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1463 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1464 } 1465 1466 if (Wait.hasWaitStoreCnt()) { 1467 assert(ST->hasVscnt()); 1468 1469 [[maybe_unused]] auto SWaitInst = 1470 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) 1471 .addReg(AMDGPU::SGPR_NULL, RegState::Undef) 1472 .addImm(Wait.StoreCnt); 1473 Modified = true; 1474 1475 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1476 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1477 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1478 } 1479 1480 return Modified; 1481 } 1482 1483 AMDGPU::Waitcnt 1484 WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { 1485 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); 1486 } 1487 1488 AMDGPU::Waitcnt 1489 WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { 1490 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0, 1491 ~0u /* XCNT */); 1492 } 1493 1494 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and 1495 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that 1496 /// were added by previous passes. Currently this pass conservatively 1497 /// assumes that these preexisting waits are required for correctness. 1498 bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( 1499 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, 1500 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { 1501 assert(ST); 1502 assert(!isNormalMode(MaxCounter)); 1503 1504 bool Modified = false; 1505 MachineInstr *CombinedLoadDsCntInstr = nullptr; 1506 MachineInstr *CombinedStoreDsCntInstr = nullptr; 1507 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; 1508 1509 LLVM_DEBUG({ 1510 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: "; 1511 if (It == OldWaitcntInstr.getParent()->instr_end()) 1512 dbgs() << "end of block\n"; 1513 else 1514 dbgs() << *It; 1515 }); 1516 1517 for (auto &II : 1518 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { 1519 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II); 1520 if (II.isMetaInstruction()) { 1521 LLVM_DEBUG(dbgs() << "skipped meta instruction\n"); 1522 continue; 1523 } 1524 1525 MachineInstr **UpdatableInstr; 1526 1527 // Update required wait count. If this is a soft waitcnt (= it was added 1528 // by an earlier pass), it may be entirely removed. 1529 1530 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); 1531 bool TrySimplify = Opcode != II.getOpcode() && !OptNone; 1532 1533 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't 1534 // attempt to do more than that either. 1535 if (Opcode == AMDGPU::S_WAITCNT) 1536 continue; 1537 1538 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) { 1539 unsigned OldEnc = 1540 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1541 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); 1542 if (TrySimplify) 1543 ScoreBrackets.simplifyWaitcnt(OldWait); 1544 Wait = Wait.combined(OldWait); 1545 UpdatableInstr = &CombinedLoadDsCntInstr; 1546 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { 1547 unsigned OldEnc = 1548 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1549 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); 1550 if (TrySimplify) 1551 ScoreBrackets.simplifyWaitcnt(OldWait); 1552 Wait = Wait.combined(OldWait); 1553 UpdatableInstr = &CombinedStoreDsCntInstr; 1554 } else { 1555 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); 1556 assert(CT.has_value()); 1557 unsigned OldCnt = 1558 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); 1559 if (TrySimplify) 1560 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); 1561 addWait(Wait, CT.value(), OldCnt); 1562 UpdatableInstr = &WaitInstrs[CT.value()]; 1563 } 1564 1565 // Merge consecutive waitcnt of the same type by erasing multiples. 1566 if (!*UpdatableInstr) { 1567 *UpdatableInstr = &II; 1568 } else { 1569 II.eraseFromParent(); 1570 Modified = true; 1571 } 1572 } 1573 1574 if (CombinedLoadDsCntInstr) { 1575 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need 1576 // to be waited for. Otherwise, let the instruction be deleted so 1577 // the appropriate single counter wait instruction can be inserted 1578 // instead, when new S_WAIT_*CNT instructions are inserted by 1579 // createNewWaitcnt(). As a side effect, resetting the wait counts will 1580 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by 1581 // the loop below that deals with single counter instructions. 1582 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { 1583 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait); 1584 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr, 1585 AMDGPU::OpName::simm16, NewEnc); 1586 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr); 1587 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); 1588 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); 1589 Wait.LoadCnt = ~0u; 1590 Wait.DsCnt = ~0u; 1591 1592 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 1593 ? dbgs() << "applied pre-existing waitcnt\n" 1594 << "New Instr at block end: " 1595 << *CombinedLoadDsCntInstr << '\n' 1596 : dbgs() << "applied pre-existing waitcnt\n" 1597 << "Old Instr: " << *It << "New Instr: " 1598 << *CombinedLoadDsCntInstr << '\n'); 1599 } else { 1600 CombinedLoadDsCntInstr->eraseFromParent(); 1601 Modified = true; 1602 } 1603 } 1604 1605 if (CombinedStoreDsCntInstr) { 1606 // Similarly for S_WAIT_STORECNT_DSCNT. 1607 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) { 1608 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait); 1609 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr, 1610 AMDGPU::OpName::simm16, NewEnc); 1611 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr); 1612 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); 1613 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); 1614 Wait.StoreCnt = ~0u; 1615 Wait.DsCnt = ~0u; 1616 1617 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 1618 ? dbgs() << "applied pre-existing waitcnt\n" 1619 << "New Instr at block end: " 1620 << *CombinedStoreDsCntInstr << '\n' 1621 : dbgs() << "applied pre-existing waitcnt\n" 1622 << "Old Instr: " << *It << "New Instr: " 1623 << *CombinedStoreDsCntInstr << '\n'); 1624 } else { 1625 CombinedStoreDsCntInstr->eraseFromParent(); 1626 Modified = true; 1627 } 1628 } 1629 1630 // Look for an opportunity to convert existing S_WAIT_LOADCNT, 1631 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT 1632 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing 1633 // instructions so that createNewWaitcnt() will create new combined 1634 // instructions to replace them. 1635 1636 if (Wait.DsCnt != ~0u) { 1637 // This is a vector of addresses in WaitInstrs pointing to instructions 1638 // that should be removed if they are present. 1639 SmallVector<MachineInstr **, 2> WaitsToErase; 1640 1641 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not 1642 // both) need to be waited for, ensure that there are no existing 1643 // individual wait count instructions for these. 1644 1645 if (Wait.LoadCnt != ~0u) { 1646 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]); 1647 WaitsToErase.push_back(&WaitInstrs[DS_CNT]); 1648 } else if (Wait.StoreCnt != ~0u) { 1649 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]); 1650 WaitsToErase.push_back(&WaitInstrs[DS_CNT]); 1651 } 1652 1653 for (MachineInstr **WI : WaitsToErase) { 1654 if (!*WI) 1655 continue; 1656 1657 (*WI)->eraseFromParent(); 1658 *WI = nullptr; 1659 Modified = true; 1660 } 1661 } 1662 1663 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { 1664 if (!WaitInstrs[CT]) 1665 continue; 1666 1667 unsigned NewCnt = getWait(Wait, CT); 1668 if (NewCnt != ~0u) { 1669 Modified |= updateOperandIfDifferent(*WaitInstrs[CT], 1670 AMDGPU::OpName::simm16, NewCnt); 1671 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]); 1672 1673 ScoreBrackets.applyWaitcnt(CT, NewCnt); 1674 setNoWait(Wait, CT); 1675 1676 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() 1677 ? dbgs() << "applied pre-existing waitcnt\n" 1678 << "New Instr at block end: " << *WaitInstrs[CT] 1679 << '\n' 1680 : dbgs() << "applied pre-existing waitcnt\n" 1681 << "Old Instr: " << *It 1682 << "New Instr: " << *WaitInstrs[CT] << '\n'); 1683 } else { 1684 WaitInstrs[CT]->eraseFromParent(); 1685 Modified = true; 1686 } 1687 } 1688 1689 return Modified; 1690 } 1691 1692 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait 1693 bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( 1694 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, 1695 AMDGPU::Waitcnt Wait) { 1696 assert(ST); 1697 assert(!isNormalMode(MaxCounter)); 1698 1699 bool Modified = false; 1700 const DebugLoc &DL = Block.findDebugLoc(It); 1701 1702 // Check for opportunities to use combined wait instructions. 1703 if (Wait.DsCnt != ~0u) { 1704 MachineInstr *SWaitInst = nullptr; 1705 1706 if (Wait.LoadCnt != ~0u) { 1707 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); 1708 1709 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) 1710 .addImm(Enc); 1711 1712 Wait.LoadCnt = ~0u; 1713 Wait.DsCnt = ~0u; 1714 } else if (Wait.StoreCnt != ~0u) { 1715 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); 1716 1717 SWaitInst = 1718 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) 1719 .addImm(Enc); 1720 1721 Wait.StoreCnt = ~0u; 1722 Wait.DsCnt = ~0u; 1723 } 1724 1725 if (SWaitInst) { 1726 Modified = true; 1727 1728 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1729 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1730 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1731 } 1732 } 1733 1734 // Generate an instruction for any remaining counter that needs 1735 // waiting for. 1736 1737 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { 1738 unsigned Count = getWait(Wait, CT); 1739 if (Count == ~0u) 1740 continue; 1741 1742 [[maybe_unused]] auto SWaitInst = 1743 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) 1744 .addImm(Count); 1745 1746 Modified = true; 1747 1748 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; 1749 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; 1750 dbgs() << "New Instr: " << *SWaitInst << '\n'); 1751 } 1752 1753 return Modified; 1754 } 1755 1756 static bool readsVCCZ(const MachineInstr &MI) { 1757 unsigned Opc = MI.getOpcode(); 1758 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && 1759 !MI.getOperand(1).isUndef(); 1760 } 1761 1762 /// \returns true if the callee inserts an s_waitcnt 0 on function entry. 1763 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { 1764 // Currently all conventions wait, but this may not always be the case. 1765 // 1766 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make 1767 // senses to omit the wait and do it in the caller. 1768 return true; 1769 } 1770 1771 /// \returns true if the callee is expected to wait for any outstanding waits 1772 /// before returning. 1773 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; } 1774 1775 /// Generate s_waitcnt instruction to be placed before cur_Inst. 1776 /// Instructions of a given type are returned in order, 1777 /// but instructions of different types can complete out of order. 1778 /// We rely on this in-order completion 1779 /// and simply assign a score to the memory access instructions. 1780 /// We keep track of the active "score bracket" to determine 1781 /// if an access of a memory read requires an s_waitcnt 1782 /// and if so what the value of each counter is. 1783 /// The "score bracket" is bound by the lower bound and upper bound 1784 /// scores (*_score_LB and *_score_ub respectively). 1785 /// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to 1786 /// flush the vmcnt counter here. 1787 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, 1788 WaitcntBrackets &ScoreBrackets, 1789 MachineInstr *OldWaitcntInstr, 1790 bool FlushVmCnt) { 1791 setForceEmitWaitcnt(); 1792 1793 assert(!MI.isMetaInstruction()); 1794 1795 AMDGPU::Waitcnt Wait; 1796 1797 // FIXME: This should have already been handled by the memory legalizer. 1798 // Removing this currently doesn't affect any lit tests, but we need to 1799 // verify that nothing was relying on this. The number of buffer invalidates 1800 // being handled here should not be expanded. 1801 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || 1802 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || 1803 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || 1804 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || 1805 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { 1806 Wait.LoadCnt = 0; 1807 } 1808 1809 // All waits must be resolved at call return. 1810 // NOTE: this could be improved with knowledge of all call sites or 1811 // with knowledge of the called routines. 1812 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || 1813 MI.getOpcode() == AMDGPU::SI_RETURN || 1814 MI.getOpcode() == AMDGPU::S_SETPC_B64_return || 1815 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { 1816 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); 1817 } 1818 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. 1819 // Technically the hardware will do this on its own if we don't, but that 1820 // might cost extra cycles compared to doing it explicitly. 1821 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may 1822 // have to wait for outstanding VMEM stores. In this case it can be useful to 1823 // send a message to explicitly release all VGPRs before the stores have 1824 // completed, but it is only safe to do this if there are no outstanding 1825 // scratch stores. 1826 else if (MI.getOpcode() == AMDGPU::S_ENDPGM || 1827 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { 1828 if (!WCG->isOptNone() && 1829 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() || 1830 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && 1831 ScoreBrackets.getScoreRange(STORE_CNT) != 0 && 1832 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)))) 1833 ReleaseVGPRInsts.insert(&MI); 1834 } 1835 // Resolve vm waits before gs-done. 1836 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || 1837 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && 1838 ST->hasLegacyGeometry() && 1839 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == 1840 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { 1841 Wait.LoadCnt = 0; 1842 } 1843 1844 // Export & GDS instructions do not read the EXEC mask until after the export 1845 // is granted (which can occur well after the instruction is issued). 1846 // The shader program must flush all EXP operations on the export-count 1847 // before overwriting the EXEC mask. 1848 else { 1849 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { 1850 // Export and GDS are tracked individually, either may trigger a waitcnt 1851 // for EXEC. 1852 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) || 1853 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) || 1854 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) || 1855 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) { 1856 Wait.ExpCnt = 0; 1857 } 1858 } 1859 1860 // Wait for any pending GDS instruction to complete before any 1861 // "Always GDS" instruction. 1862 if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS()) 1863 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); 1864 1865 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { 1866 // The function is going to insert a wait on everything in its prolog. 1867 // This still needs to be careful if the call target is a load (e.g. a GOT 1868 // load). We also need to check WAW dependency with saved PC. 1869 Wait = AMDGPU::Waitcnt(); 1870 1871 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); 1872 if (CallAddrOp.isReg()) { 1873 RegInterval CallAddrOpInterval = 1874 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp); 1875 1876 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval, 1877 Wait); 1878 1879 if (const auto *RtnAddrOp = 1880 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) { 1881 RegInterval RtnAddrOpInterval = 1882 ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp); 1883 1884 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval, 1885 Wait); 1886 } 1887 } 1888 } else { 1889 // FIXME: Should not be relying on memoperands. 1890 // Look at the source operands of every instruction to see if 1891 // any of them results from a previous memory operation that affects 1892 // its current usage. If so, an s_waitcnt instruction needs to be 1893 // emitted. 1894 // If the source operand was defined by a load, add the s_waitcnt 1895 // instruction. 1896 // 1897 // Two cases are handled for destination operands: 1898 // 1) If the destination operand was defined by a load, add the s_waitcnt 1899 // instruction to guarantee the right WAW order. 1900 // 2) If a destination operand that was used by a recent export/store ins, 1901 // add s_waitcnt on exp_cnt to guarantee the WAR order. 1902 1903 for (const MachineMemOperand *Memop : MI.memoperands()) { 1904 const Value *Ptr = Memop->getValue(); 1905 if (Memop->isStore()) { 1906 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) { 1907 addWait(Wait, SmemAccessCounter, 0); 1908 if (PDT->dominates(MI.getParent(), It->second)) 1909 SLoadAddresses.erase(It); 1910 } 1911 } 1912 unsigned AS = Memop->getAddrSpace(); 1913 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS) 1914 continue; 1915 // No need to wait before load from VMEM to LDS. 1916 if (TII->mayWriteLDSThroughDMA(MI)) 1917 continue; 1918 1919 // LOAD_CNT is only relevant to vgpr or LDS. 1920 unsigned RegNo = FIRST_LDS_VGPR; 1921 // Only objects with alias scope info were added to LDSDMAScopes array. 1922 // In the absense of the scope info we will not be able to disambiguate 1923 // aliasing here. There is no need to try searching for a corresponding 1924 // store slot. This is conservatively correct because in that case we 1925 // will produce a wait using the first (general) LDS DMA wait slot which 1926 // will wait on all of them anyway. 1927 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { 1928 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); 1929 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { 1930 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) 1931 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); 1932 } 1933 } else { 1934 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); 1935 } 1936 if (Memop->isStore()) { 1937 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); 1938 } 1939 } 1940 1941 // Loop over use and def operands. 1942 for (const MachineOperand &Op : MI.operands()) { 1943 if (!Op.isReg()) 1944 continue; 1945 1946 // If the instruction does not read tied source, skip the operand. 1947 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) 1948 continue; 1949 1950 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op); 1951 1952 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); 1953 if (IsVGPR) { 1954 // Implicit VGPR defs and uses are never a part of the memory 1955 // instructions description and usually present to account for 1956 // super-register liveness. 1957 // TODO: Most of the other instructions also have implicit uses 1958 // for the liveness accounting only. 1959 if (Op.isImplicit() && MI.mayLoadOrStore()) 1960 continue; 1961 1962 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the 1963 // previous write and this write are the same type of VMEM 1964 // instruction, in which case they are (in some architectures) 1965 // guaranteed to write their results in order anyway. 1966 // Additionally check instructions where Point Sample Acceleration 1967 // might be applied. 1968 if (Op.isUse() || !updateVMCntOnly(MI) || 1969 ScoreBrackets.hasOtherPendingVmemTypes(Interval, 1970 getVmemType(MI)) || 1971 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) || 1972 !ST->hasVmemWriteVgprInOrder()) { 1973 ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait); 1974 ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait); 1975 ScoreBrackets.determineWait(BVH_CNT, Interval, Wait); 1976 ScoreBrackets.clearVgprVmemTypes(Interval); 1977 } 1978 1979 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { 1980 ScoreBrackets.determineWait(EXP_CNT, Interval, Wait); 1981 } 1982 ScoreBrackets.determineWait(DS_CNT, Interval, Wait); 1983 } else { 1984 ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); 1985 } 1986 1987 if (hasXcnt() && Op.isDef()) 1988 ScoreBrackets.determineWait(X_CNT, Interval, Wait); 1989 } 1990 } 1991 } 1992 1993 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does 1994 // not, we need to ensure the subtarget is capable of backing off barrier 1995 // instructions in case there are any outstanding memory operations that may 1996 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 1997 if (TII->isBarrierStart(MI.getOpcode()) && 1998 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { 1999 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); 2000 } 2001 2002 // TODO: Remove this work-around, enable the assert for Bug 457939 2003 // after fixing the scheduler. Also, the Shader Compiler code is 2004 // independent of target. 2005 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { 2006 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { 2007 Wait.DsCnt = 0; 2008 } 2009 } 2010 2011 // Verify that the wait is actually needed. 2012 ScoreBrackets.simplifyWaitcnt(Wait); 2013 2014 // When forcing emit, we need to skip terminators because that would break the 2015 // terminators of the MBB if we emit a waitcnt between terminators. 2016 if (ForceEmitZeroFlag && !MI.isTerminator()) 2017 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); 2018 2019 if (ForceEmitWaitcnt[LOAD_CNT]) 2020 Wait.LoadCnt = 0; 2021 if (ForceEmitWaitcnt[EXP_CNT]) 2022 Wait.ExpCnt = 0; 2023 if (ForceEmitWaitcnt[DS_CNT]) 2024 Wait.DsCnt = 0; 2025 if (ForceEmitWaitcnt[SAMPLE_CNT]) 2026 Wait.SampleCnt = 0; 2027 if (ForceEmitWaitcnt[BVH_CNT]) 2028 Wait.BvhCnt = 0; 2029 if (ForceEmitWaitcnt[KM_CNT]) 2030 Wait.KmCnt = 0; 2031 if (ForceEmitWaitcnt[X_CNT]) 2032 Wait.XCnt = 0; 2033 2034 if (FlushVmCnt) { 2035 if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) 2036 Wait.LoadCnt = 0; 2037 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) 2038 Wait.SampleCnt = 0; 2039 if (ScoreBrackets.hasPendingEvent(BVH_CNT)) 2040 Wait.BvhCnt = 0; 2041 } 2042 2043 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u) 2044 Wait.LoadCnt = 0; 2045 2046 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, 2047 OldWaitcntInstr); 2048 } 2049 2050 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, 2051 MachineBasicBlock::instr_iterator It, 2052 MachineBasicBlock &Block, 2053 WaitcntBrackets &ScoreBrackets, 2054 MachineInstr *OldWaitcntInstr) { 2055 bool Modified = false; 2056 2057 if (OldWaitcntInstr) 2058 // Try to merge the required wait with preexisting waitcnt instructions. 2059 // Also erase redundant waitcnt. 2060 Modified = 2061 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); 2062 2063 // Any counts that could have been applied to any existing waitcnt 2064 // instructions will have been done so, now deal with any remaining. 2065 ScoreBrackets.applyWaitcnt(Wait); 2066 2067 // ExpCnt can be merged into VINTERP. 2068 if (Wait.ExpCnt != ~0u && It != Block.instr_end() && 2069 SIInstrInfo::isVINTERP(*It)) { 2070 MachineOperand *WaitExp = 2071 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp); 2072 if (Wait.ExpCnt < WaitExp->getImm()) { 2073 WaitExp->setImm(Wait.ExpCnt); 2074 Modified = true; 2075 } 2076 Wait.ExpCnt = ~0u; 2077 2078 LLVM_DEBUG(dbgs() << "generateWaitcnt\n" 2079 << "Update Instr: " << *It); 2080 } 2081 2082 // XCnt may be already consumed by a load wait. 2083 if (Wait.KmCnt == 0 && Wait.XCnt != ~0u && 2084 !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) 2085 Wait.XCnt = ~0u; 2086 2087 if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u && 2088 !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) 2089 Wait.XCnt = ~0u; 2090 2091 // Since the translation for VMEM addresses occur in-order, we can skip the 2092 // XCnt if the current instruction is of VMEM type and has a memory dependency 2093 // with another VMEM instruction in flight. 2094 if (Wait.XCnt != ~0u && isVmemAccess(*It)) 2095 Wait.XCnt = ~0u; 2096 2097 if (WCG->createNewWaitcnt(Block, It, Wait)) 2098 Modified = true; 2099 2100 return Modified; 2101 } 2102 2103 // This is a flat memory operation. Check to see if it has memory tokens other 2104 // than LDS. Other address spaces supported by flat memory operations involve 2105 // global memory. 2106 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { 2107 assert(TII->isFLAT(MI)); 2108 2109 // All flat instructions use the VMEM counter. 2110 assert(TII->usesVM_CNT(MI)); 2111 2112 // If there are no memory operands then conservatively assume the flat 2113 // operation may access VMEM. 2114 if (MI.memoperands_empty()) 2115 return true; 2116 2117 // See if any memory operand specifies an address space that involves VMEM. 2118 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces 2119 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION 2120 // (GDS) address space is not supported by flat operations. Therefore, simply 2121 // return true unless only the LDS address space is found. 2122 for (const MachineMemOperand *Memop : MI.memoperands()) { 2123 unsigned AS = Memop->getAddrSpace(); 2124 assert(AS != AMDGPUAS::REGION_ADDRESS); 2125 if (AS != AMDGPUAS::LOCAL_ADDRESS) 2126 return true; 2127 } 2128 2129 return false; 2130 } 2131 2132 // This is a flat memory operation. Check to see if it has memory tokens for 2133 // either LDS or FLAT. 2134 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { 2135 assert(TII->isFLAT(MI)); 2136 2137 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. 2138 if (!TII->usesLGKM_CNT(MI)) 2139 return false; 2140 2141 // If in tgsplit mode then there can be no use of LDS. 2142 if (ST->isTgSplitEnabled()) 2143 return false; 2144 2145 // If there are no memory operands then conservatively assume the flat 2146 // operation may access LDS. 2147 if (MI.memoperands_empty()) 2148 return true; 2149 2150 // See if any memory operand specifies an address space that involves LDS. 2151 for (const MachineMemOperand *Memop : MI.memoperands()) { 2152 unsigned AS = Memop->getAddrSpace(); 2153 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) 2154 return true; 2155 } 2156 2157 return false; 2158 } 2159 2160 // This is a flat memory operation. Check to see if it has memory tokens for 2161 // either scratch or FLAT. 2162 bool SIInsertWaitcnts::mayAccessScratchThroughFlat( 2163 const MachineInstr &MI) const { 2164 assert(TII->isFLAT(MI)); 2165 2166 // SCRATCH instructions always access scratch. 2167 if (TII->isFLATScratch(MI)) 2168 return true; 2169 2170 // GLOBAL instructions never access scratch. 2171 if (TII->isFLATGlobal(MI)) 2172 return false; 2173 2174 // If there are no memory operands then conservatively assume the flat 2175 // operation may access scratch. 2176 if (MI.memoperands_empty()) 2177 return true; 2178 2179 // See if any memory operand specifies an address space that involves scratch. 2180 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { 2181 unsigned AS = Memop->getAddrSpace(); 2182 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; 2183 }); 2184 } 2185 2186 bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { 2187 return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || 2188 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); 2189 } 2190 2191 static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) { 2192 auto Opc = Inst.getOpcode(); 2193 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || 2194 Opc == AMDGPU::GLOBAL_WBINV; 2195 } 2196 2197 // Return true if the next instruction is S_ENDPGM, following fallthrough 2198 // blocks if necessary. 2199 bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It, 2200 MachineBasicBlock *Block) const { 2201 auto BlockEnd = Block->getParent()->end(); 2202 auto BlockIter = Block->getIterator(); 2203 2204 while (true) { 2205 if (It.isEnd()) { 2206 if (++BlockIter != BlockEnd) { 2207 It = BlockIter->instr_begin(); 2208 continue; 2209 } 2210 2211 return false; 2212 } 2213 2214 if (!It->isMetaInstruction()) 2215 break; 2216 2217 It++; 2218 } 2219 2220 assert(!It.isEnd()); 2221 2222 return It->getOpcode() == AMDGPU::S_ENDPGM; 2223 } 2224 2225 // Add a wait after an instruction if architecture requirements mandate one. 2226 bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst, 2227 MachineBasicBlock &Block, 2228 WaitcntBrackets &ScoreBrackets) { 2229 AMDGPU::Waitcnt Wait; 2230 bool NeedsEndPGMCheck = false; 2231 2232 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) 2233 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() && 2234 !SIInstrInfo::isAtomicRet(Inst)); 2235 2236 if (TII->isAlwaysGDS(Inst.getOpcode())) { 2237 Wait.DsCnt = 0; 2238 NeedsEndPGMCheck = true; 2239 } 2240 2241 ScoreBrackets.simplifyWaitcnt(Wait); 2242 2243 auto SuccessorIt = std::next(Inst.getIterator()); 2244 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets, 2245 /*OldWaitcntInstr=*/nullptr); 2246 2247 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) { 2248 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP)) 2249 .addImm(0); 2250 } 2251 2252 return Result; 2253 } 2254 2255 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, 2256 WaitcntBrackets *ScoreBrackets) { 2257 // Now look at the instruction opcode. If it is a memory access 2258 // instruction, update the upper-bound of the appropriate counter's 2259 // bracket and the destination operand scores. 2260 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. 2261 2262 bool IsVMEMAccess = false; 2263 bool IsSMEMAccess = false; 2264 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { 2265 if (TII->isAlwaysGDS(Inst.getOpcode()) || 2266 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { 2267 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); 2268 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); 2269 ScoreBrackets->setPendingGDS(); 2270 } else { 2271 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 2272 } 2273 } else if (TII->isFLAT(Inst)) { 2274 if (isGFX12CacheInvOrWBInst(Inst)) { 2275 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), 2276 Inst); 2277 return; 2278 } 2279 2280 assert(Inst.mayLoadOrStore()); 2281 2282 int FlatASCount = 0; 2283 2284 if (mayAccessVMEMThroughFlat(Inst)) { 2285 ++FlatASCount; 2286 IsVMEMAccess = true; 2287 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), 2288 Inst); 2289 } 2290 2291 if (mayAccessLDSThroughFlat(Inst)) { 2292 ++FlatASCount; 2293 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); 2294 } 2295 2296 // A Flat memory operation must access at least one address space. 2297 assert(FlatASCount); 2298 2299 // This is a flat memory operation that access both VMEM and LDS, so note it 2300 // - it will require that both the VM and LGKM be flushed to zero if it is 2301 // pending when a VM or LGKM dependency occurs. 2302 if (FlatASCount > 1) 2303 ScoreBrackets->setPendingFlat(); 2304 } else if (SIInstrInfo::isVMEM(Inst) && 2305 !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { 2306 IsVMEMAccess = true; 2307 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), 2308 Inst); 2309 2310 if (ST->vmemWriteNeedsExpWaitcnt() && 2311 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) { 2312 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); 2313 } 2314 } else if (TII->isSMRD(Inst)) { 2315 IsSMEMAccess = true; 2316 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); 2317 } else if (Inst.isCall()) { 2318 if (callWaitsOnFunctionReturn(Inst)) { 2319 // Act as a wait on everything 2320 ScoreBrackets->applyWaitcnt( 2321 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); 2322 ScoreBrackets->setStateOnFunctionEntryOrReturn(); 2323 } else { 2324 // May need to way wait for anything. 2325 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); 2326 } 2327 } else if (SIInstrInfo::isLDSDIR(Inst)) { 2328 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); 2329 } else if (TII->isVINTERP(Inst)) { 2330 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); 2331 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); 2332 } else if (SIInstrInfo::isEXP(Inst)) { 2333 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); 2334 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) 2335 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); 2336 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST) 2337 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); 2338 else 2339 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); 2340 } else { 2341 switch (Inst.getOpcode()) { 2342 case AMDGPU::S_SENDMSG: 2343 case AMDGPU::S_SENDMSG_RTN_B32: 2344 case AMDGPU::S_SENDMSG_RTN_B64: 2345 case AMDGPU::S_SENDMSGHALT: 2346 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); 2347 break; 2348 case AMDGPU::S_MEMTIME: 2349 case AMDGPU::S_MEMREALTIME: 2350 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: 2351 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: 2352 case AMDGPU::S_GET_BARRIER_STATE_M0: 2353 case AMDGPU::S_GET_BARRIER_STATE_IMM: 2354 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); 2355 break; 2356 } 2357 } 2358 2359 if (!hasXcnt()) 2360 return; 2361 2362 if (IsVMEMAccess) 2363 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst); 2364 2365 if (IsSMEMAccess) 2366 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst); 2367 } 2368 2369 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, 2370 unsigned OtherScore) { 2371 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; 2372 unsigned OtherShifted = 2373 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; 2374 Score = std::max(MyShifted, OtherShifted); 2375 return OtherShifted > MyShifted; 2376 } 2377 2378 /// Merge the pending events and associater score brackets of \p Other into 2379 /// this brackets status. 2380 /// 2381 /// Returns whether the merge resulted in a change that requires tighter waits 2382 /// (i.e. the merged brackets strictly dominate the original brackets). 2383 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { 2384 bool StrictDom = false; 2385 2386 VgprUB = std::max(VgprUB, Other.VgprUB); 2387 SgprUB = std::max(SgprUB, Other.SgprUB); 2388 2389 for (auto T : inst_counter_types(MaxCounter)) { 2390 // Merge event flags for this counter 2391 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; 2392 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; 2393 if (OtherEvents & ~OldEvents) 2394 StrictDom = true; 2395 PendingEvents |= OtherEvents; 2396 2397 // Merge scores for this counter 2398 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; 2399 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; 2400 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending); 2401 if (NewUB < ScoreLBs[T]) 2402 report_fatal_error("waitcnt score overflow"); 2403 2404 MergeInfo M; 2405 M.OldLB = ScoreLBs[T]; 2406 M.OtherLB = Other.ScoreLBs[T]; 2407 M.MyShift = NewUB - ScoreUBs[T]; 2408 M.OtherShift = NewUB - Other.ScoreUBs[T]; 2409 2410 ScoreUBs[T] = NewUB; 2411 2412 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); 2413 2414 if (T == DS_CNT) 2415 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS); 2416 2417 for (int J = 0; J <= VgprUB; J++) 2418 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); 2419 2420 if (isSmemCounter(T)) { 2421 unsigned Idx = getSgprScoresIdx(T); 2422 for (int J = 0; J <= SgprUB; J++) 2423 StrictDom |= 2424 mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]); 2425 } 2426 } 2427 2428 for (int J = 0; J <= VgprUB; J++) { 2429 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; 2430 StrictDom |= NewVmemTypes != VgprVmemTypes[J]; 2431 VgprVmemTypes[J] = NewVmemTypes; 2432 } 2433 2434 return StrictDom; 2435 } 2436 2437 static bool isWaitInstr(MachineInstr &Inst) { 2438 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode()); 2439 return Opcode == AMDGPU::S_WAITCNT || 2440 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() && 2441 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || 2442 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || 2443 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || 2444 counterTypeForInstr(Opcode).has_value(); 2445 } 2446 2447 // Generate s_waitcnt instructions where needed. 2448 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, 2449 MachineBasicBlock &Block, 2450 WaitcntBrackets &ScoreBrackets) { 2451 bool Modified = false; 2452 2453 LLVM_DEBUG({ 2454 dbgs() << "*** Begin Block: "; 2455 Block.printName(dbgs()); 2456 ScoreBrackets.dump(); 2457 }); 2458 2459 // Track the correctness of vccz through this basic block. There are two 2460 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and 2461 // ST->partialVCCWritesUpdateVCCZ(). 2462 bool VCCZCorrect = true; 2463 if (ST->hasReadVCCZBug()) { 2464 // vccz could be incorrect at a basic block boundary if a predecessor wrote 2465 // to vcc and then issued an smem load. 2466 VCCZCorrect = false; 2467 } else if (!ST->partialVCCWritesUpdateVCCZ()) { 2468 // vccz could be incorrect at a basic block boundary if a predecessor wrote 2469 // to vcc_lo or vcc_hi. 2470 VCCZCorrect = false; 2471 } 2472 2473 // Walk over the instructions. 2474 MachineInstr *OldWaitcntInstr = nullptr; 2475 2476 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), 2477 E = Block.instr_end(); 2478 Iter != E;) { 2479 MachineInstr &Inst = *Iter; 2480 if (Inst.isMetaInstruction()) { 2481 ++Iter; 2482 continue; 2483 } 2484 2485 // Track pre-existing waitcnts that were added in earlier iterations or by 2486 // the memory legalizer. 2487 if (isWaitInstr(Inst)) { 2488 if (!OldWaitcntInstr) 2489 OldWaitcntInstr = &Inst; 2490 ++Iter; 2491 continue; 2492 } 2493 2494 bool FlushVmCnt = Block.getFirstTerminator() == Inst && 2495 isPreheaderToFlush(Block, ScoreBrackets); 2496 2497 // Generate an s_waitcnt instruction to be placed before Inst, if needed. 2498 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr, 2499 FlushVmCnt); 2500 OldWaitcntInstr = nullptr; 2501 2502 // Restore vccz if it's not known to be correct already. 2503 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst); 2504 2505 // Don't examine operands unless we need to track vccz correctness. 2506 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { 2507 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) || 2508 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) { 2509 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz. 2510 if (!ST->partialVCCWritesUpdateVCCZ()) 2511 VCCZCorrect = false; 2512 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) { 2513 // There is a hardware bug on CI/SI where SMRD instruction may corrupt 2514 // vccz bit, so when we detect that an instruction may read from a 2515 // corrupt vccz bit, we need to: 2516 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD 2517 // operations to complete. 2518 // 2. Restore the correct value of vccz by writing the current value 2519 // of vcc back to vcc. 2520 if (ST->hasReadVCCZBug() && 2521 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { 2522 // Writes to vcc while there's an outstanding smem read may get 2523 // clobbered as soon as any read completes. 2524 VCCZCorrect = false; 2525 } else { 2526 // Writes to vcc will fix any incorrect value in vccz. 2527 VCCZCorrect = true; 2528 } 2529 } 2530 } 2531 2532 if (TII->isSMRD(Inst)) { 2533 for (const MachineMemOperand *Memop : Inst.memoperands()) { 2534 // No need to handle invariant loads when avoiding WAR conflicts, as 2535 // there cannot be a vector store to the same memory location. 2536 if (!Memop->isInvariant()) { 2537 const Value *Ptr = Memop->getValue(); 2538 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent())); 2539 } 2540 } 2541 if (ST->hasReadVCCZBug()) { 2542 // This smem read could complete and clobber vccz at any time. 2543 VCCZCorrect = false; 2544 } 2545 } 2546 2547 updateEventWaitcntAfter(Inst, &ScoreBrackets); 2548 2549 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets); 2550 2551 LLVM_DEBUG({ 2552 Inst.print(dbgs()); 2553 ScoreBrackets.dump(); 2554 }); 2555 2556 // TODO: Remove this work-around after fixing the scheduler and enable the 2557 // assert above. 2558 if (RestoreVCCZ) { 2559 // Restore the vccz bit. Any time a value is written to vcc, the vcc 2560 // bit is updated, so we can restore the bit by reading the value of 2561 // vcc and then writing it back to the register. 2562 BuildMI(Block, Inst, Inst.getDebugLoc(), 2563 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), 2564 TRI->getVCC()) 2565 .addReg(TRI->getVCC()); 2566 VCCZCorrect = true; 2567 Modified = true; 2568 } 2569 2570 ++Iter; 2571 } 2572 2573 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if 2574 // needed. 2575 AMDGPU::Waitcnt Wait; 2576 if (Block.getFirstTerminator() == Block.end() && 2577 isPreheaderToFlush(Block, ScoreBrackets)) { 2578 if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) 2579 Wait.LoadCnt = 0; 2580 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) 2581 Wait.SampleCnt = 0; 2582 if (ScoreBrackets.hasPendingEvent(BVH_CNT)) 2583 Wait.BvhCnt = 0; 2584 } 2585 2586 // Combine or remove any redundant waitcnts at the end of the block. 2587 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, 2588 OldWaitcntInstr); 2589 2590 LLVM_DEBUG({ 2591 dbgs() << "*** End Block: "; 2592 Block.printName(dbgs()); 2593 ScoreBrackets.dump(); 2594 }); 2595 2596 return Modified; 2597 } 2598 2599 // Return true if the given machine basic block is a preheader of a loop in 2600 // which we want to flush the vmcnt counter, and false otherwise. 2601 bool SIInsertWaitcnts::isPreheaderToFlush( 2602 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) { 2603 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false); 2604 if (!IsInserted) 2605 return Iterator->second; 2606 2607 MachineBasicBlock *Succ = MBB.getSingleSuccessor(); 2608 if (!Succ) 2609 return false; 2610 2611 MachineLoop *Loop = MLI->getLoopFor(Succ); 2612 if (!Loop) 2613 return false; 2614 2615 if (Loop->getLoopPreheader() == &MBB && 2616 shouldFlushVmCnt(Loop, ScoreBrackets)) { 2617 Iterator->second = true; 2618 return true; 2619 } 2620 2621 return false; 2622 } 2623 2624 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { 2625 if (SIInstrInfo::isFLAT(MI)) 2626 return mayAccessVMEMThroughFlat(MI); 2627 return SIInstrInfo::isVMEM(MI); 2628 } 2629 2630 // Return true if it is better to flush the vmcnt counter in the preheader of 2631 // the given loop. We currently decide to flush in two situations: 2632 // 1. The loop contains vmem store(s), no vmem load and at least one use of a 2633 // vgpr containing a value that is loaded outside of the loop. (Only on 2634 // targets with no vscnt counter). 2635 // 2. The loop contains vmem load(s), but the loaded values are not used in the 2636 // loop, and at least one use of a vgpr containing a value that is loaded 2637 // outside of the loop. 2638 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, 2639 const WaitcntBrackets &Brackets) { 2640 bool HasVMemLoad = false; 2641 bool HasVMemStore = false; 2642 bool UsesVgprLoadedOutside = false; 2643 DenseSet<Register> VgprUse; 2644 DenseSet<Register> VgprDef; 2645 2646 for (MachineBasicBlock *MBB : ML->blocks()) { 2647 for (MachineInstr &MI : *MBB) { 2648 if (isVMEMOrFlatVMEM(MI)) { 2649 if (MI.mayLoad()) 2650 HasVMemLoad = true; 2651 if (MI.mayStore()) 2652 HasVMemStore = true; 2653 } 2654 for (const MachineOperand &Op : MI.all_uses()) { 2655 if (!TRI->isVectorRegister(*MRI, Op.getReg())) 2656 continue; 2657 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op); 2658 // Vgpr use 2659 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 2660 // If we find a register that is loaded inside the loop, 1. and 2. 2661 // are invalidated and we can exit. 2662 if (VgprDef.contains(RegNo)) 2663 return false; 2664 VgprUse.insert(RegNo); 2665 // If at least one of Op's registers is in the score brackets, the 2666 // value is likely loaded outside of the loop. 2667 if (Brackets.getRegScore(RegNo, LOAD_CNT) > 2668 Brackets.getScoreLB(LOAD_CNT) || 2669 Brackets.getRegScore(RegNo, SAMPLE_CNT) > 2670 Brackets.getScoreLB(SAMPLE_CNT) || 2671 Brackets.getRegScore(RegNo, BVH_CNT) > 2672 Brackets.getScoreLB(BVH_CNT)) { 2673 UsesVgprLoadedOutside = true; 2674 break; 2675 } 2676 } 2677 } 2678 2679 // VMem load vgpr def 2680 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) { 2681 for (const MachineOperand &Op : MI.all_defs()) { 2682 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op); 2683 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 2684 // If we find a register that is loaded inside the loop, 1. and 2. 2685 // are invalidated and we can exit. 2686 if (VgprUse.contains(RegNo)) 2687 return false; 2688 VgprDef.insert(RegNo); 2689 } 2690 } 2691 } 2692 } 2693 } 2694 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) 2695 return true; 2696 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); 2697 } 2698 2699 bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) { 2700 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); 2701 auto *PDT = 2702 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); 2703 AliasAnalysis *AA = nullptr; 2704 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) 2705 AA = &AAR->getAAResults(); 2706 2707 return SIInsertWaitcnts(MLI, PDT, AA).run(MF); 2708 } 2709 2710 PreservedAnalyses 2711 SIInsertWaitcntsPass::run(MachineFunction &MF, 2712 MachineFunctionAnalysisManager &MFAM) { 2713 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF); 2714 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF); 2715 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF) 2716 .getManager() 2717 .getCachedResult<AAManager>(MF.getFunction()); 2718 2719 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF)) 2720 return PreservedAnalyses::all(); 2721 2722 return getMachineFunctionPassPreservedAnalyses() 2723 .preserveSet<CFGAnalyses>() 2724 .preserve<AAManager>(); 2725 } 2726 2727 bool SIInsertWaitcnts::run(MachineFunction &MF) { 2728 ST = &MF.getSubtarget<GCNSubtarget>(); 2729 TII = ST->getInstrInfo(); 2730 TRI = &TII->getRegisterInfo(); 2731 MRI = &MF.getRegInfo(); 2732 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2733 2734 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); 2735 2736 if (ST->hasExtendedWaitCounts()) { 2737 MaxCounter = NUM_EXTENDED_INST_CNTS; 2738 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter); 2739 WCG = &WCGGFX12Plus; 2740 } else { 2741 MaxCounter = NUM_NORMAL_INST_CNTS; 2742 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF); 2743 WCG = &WCGPreGFX12; 2744 } 2745 2746 for (auto T : inst_counter_types()) 2747 ForceEmitWaitcnt[T] = false; 2748 2749 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); 2750 2751 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); 2752 2753 HardwareLimits Limits = {}; 2754 if (ST->hasExtendedWaitCounts()) { 2755 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); 2756 Limits.DscntMax = AMDGPU::getDscntBitMask(IV); 2757 } else { 2758 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV); 2759 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); 2760 } 2761 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); 2762 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); 2763 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); 2764 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); 2765 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); 2766 Limits.XcntMax = AMDGPU::getXcntBitMask(IV); 2767 2768 [[maybe_unused]] unsigned NumVGPRsMax = 2769 ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()); 2770 [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); 2771 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); 2772 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); 2773 2774 BlockInfos.clear(); 2775 bool Modified = false; 2776 2777 MachineBasicBlock &EntryBB = MF.front(); 2778 MachineBasicBlock::iterator I = EntryBB.begin(); 2779 2780 if (!MFI->isEntryFunction()) { 2781 // Wait for any outstanding memory operations that the input registers may 2782 // depend on. We can't track them and it's better to do the wait after the 2783 // costly call sequence. 2784 2785 // TODO: Could insert earlier and schedule more liberally with operations 2786 // that only use caller preserved registers. 2787 for (MachineBasicBlock::iterator E = EntryBB.end(); 2788 I != E && (I->isPHI() || I->isMetaInstruction()); ++I) 2789 ; 2790 2791 if (ST->hasExtendedWaitCounts()) { 2792 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) 2793 .addImm(0); 2794 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { 2795 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT) 2796 continue; 2797 2798 if (!ST->hasImageInsts() && 2799 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT)) 2800 continue; 2801 2802 BuildMI(EntryBB, I, DebugLoc(), 2803 TII->get(instrsForExtendedCounterTypes[CT])) 2804 .addImm(0); 2805 } 2806 } else { 2807 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); 2808 } 2809 2810 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( 2811 ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); 2812 NonKernelInitialState->setStateOnFunctionEntryOrReturn(); 2813 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); 2814 2815 Modified = true; 2816 } 2817 2818 // Keep iterating over the blocks in reverse post order, inserting and 2819 // updating s_waitcnt where needed, until a fix point is reached. 2820 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF)) 2821 BlockInfos.try_emplace(MBB); 2822 2823 std::unique_ptr<WaitcntBrackets> Brackets; 2824 bool Repeat; 2825 do { 2826 Repeat = false; 2827 2828 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE; 2829 ++BII) { 2830 MachineBasicBlock *MBB = BII->first; 2831 BlockInfo &BI = BII->second; 2832 if (!BI.Dirty) 2833 continue; 2834 2835 if (BI.Incoming) { 2836 if (!Brackets) 2837 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming); 2838 else 2839 *Brackets = *BI.Incoming; 2840 } else { 2841 if (!Brackets) { 2842 Brackets = std::make_unique<WaitcntBrackets>( 2843 ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); 2844 } else { 2845 // Reinitialize in-place. N.B. do not do this by assigning from a 2846 // temporary because the WaitcntBrackets class is large and it could 2847 // cause this function to use an unreasonable amount of stack space. 2848 Brackets->~WaitcntBrackets(); 2849 new (Brackets.get()) WaitcntBrackets( 2850 ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); 2851 } 2852 } 2853 2854 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets); 2855 BI.Dirty = false; 2856 2857 if (Brackets->hasPendingEvent()) { 2858 BlockInfo *MoveBracketsToSucc = nullptr; 2859 for (MachineBasicBlock *Succ : MBB->successors()) { 2860 auto *SuccBII = BlockInfos.find(Succ); 2861 BlockInfo &SuccBI = SuccBII->second; 2862 if (!SuccBI.Incoming) { 2863 SuccBI.Dirty = true; 2864 if (SuccBII <= BII) { 2865 LLVM_DEBUG(dbgs() << "repeat on backedge\n"); 2866 Repeat = true; 2867 } 2868 if (!MoveBracketsToSucc) { 2869 MoveBracketsToSucc = &SuccBI; 2870 } else { 2871 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets); 2872 } 2873 } else if (SuccBI.Incoming->merge(*Brackets)) { 2874 SuccBI.Dirty = true; 2875 if (SuccBII <= BII) { 2876 LLVM_DEBUG(dbgs() << "repeat on backedge\n"); 2877 Repeat = true; 2878 } 2879 } 2880 } 2881 if (MoveBracketsToSucc) 2882 MoveBracketsToSucc->Incoming = std::move(Brackets); 2883 } 2884 } 2885 } while (Repeat); 2886 2887 if (ST->hasScalarStores()) { 2888 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; 2889 bool HaveScalarStores = false; 2890 2891 for (MachineBasicBlock &MBB : MF) { 2892 for (MachineInstr &MI : MBB) { 2893 if (!HaveScalarStores && TII->isScalarStore(MI)) 2894 HaveScalarStores = true; 2895 2896 if (MI.getOpcode() == AMDGPU::S_ENDPGM || 2897 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) 2898 EndPgmBlocks.push_back(&MBB); 2899 } 2900 } 2901 2902 if (HaveScalarStores) { 2903 // If scalar writes are used, the cache must be flushed or else the next 2904 // wave to reuse the same scratch memory can be clobbered. 2905 // 2906 // Insert s_dcache_wb at wave termination points if there were any scalar 2907 // stores, and only if the cache hasn't already been flushed. This could 2908 // be improved by looking across blocks for flushes in postdominating 2909 // blocks from the stores but an explicitly requested flush is probably 2910 // very rare. 2911 for (MachineBasicBlock *MBB : EndPgmBlocks) { 2912 bool SeenDCacheWB = false; 2913 2914 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); 2915 I != E; ++I) { 2916 if (I->getOpcode() == AMDGPU::S_DCACHE_WB) 2917 SeenDCacheWB = true; 2918 else if (TII->isScalarStore(*I)) 2919 SeenDCacheWB = false; 2920 2921 // FIXME: It would be better to insert this before a waitcnt if any. 2922 if ((I->getOpcode() == AMDGPU::S_ENDPGM || 2923 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && 2924 !SeenDCacheWB) { 2925 Modified = true; 2926 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); 2927 } 2928 } 2929 } 2930 } 2931 } 2932 2933 // Deallocate the VGPRs before previously identified S_ENDPGM instructions. 2934 // This is done in different ways depending on how the VGPRs were allocated 2935 // (i.e. whether we're in dynamic VGPR mode or not). 2936 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short 2937 // waveslot limited kernel runs slower with the deallocation. 2938 if (MFI->isDynamicVGPREnabled()) { 2939 for (MachineInstr *MI : ReleaseVGPRInsts) { 2940 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 2941 TII->get(AMDGPU::S_ALLOC_VGPR)) 2942 .addImm(0); 2943 Modified = true; 2944 } 2945 } else { 2946 if (!ReleaseVGPRInsts.empty() && 2947 (MF.getFrameInfo().hasCalls() || 2948 ST->getOccupancyWithNumVGPRs( 2949 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass), 2950 /*IsDynamicVGPR=*/false) < 2951 AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { 2952 for (MachineInstr *MI : ReleaseVGPRInsts) { 2953 if (ST->requiresNopBeforeDeallocVGPRs()) { 2954 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 2955 TII->get(AMDGPU::S_NOP)) 2956 .addImm(0); 2957 } 2958 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), 2959 TII->get(AMDGPU::S_SENDMSG)) 2960 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); 2961 Modified = true; 2962 } 2963 } 2964 } 2965 ReleaseVGPRInsts.clear(); 2966 PreheadersToFlush.clear(); 2967 SLoadAddresses.clear(); 2968 2969 return Modified; 2970 } 2971