1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Insert wait instructions for memory reads and writes. 11 /// 12 /// Memory reads and writes are issued asynchronously, so we need to insert 13 /// S_WAITCNT instructions when we want to access any of their results or 14 /// overwrite any register that's used asynchronously. 15 /// 16 /// TODO: This pass currently keeps one timeline per hardware counter. A more 17 /// finely-grained approach that keeps one timeline per event type could 18 /// sometimes get away with generating weaker s_waitcnt instructions. For 19 /// example, when both SMEM and LDS are in flight and we need to wait for 20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, 21 /// but the pass will currently generate a conservative lgkmcnt(0) because 22 /// multiple event types are in flight. 23 // 24 //===----------------------------------------------------------------------===// 25 26 #include "AMDGPU.h" 27 #include "GCNSubtarget.h" 28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 29 #include "SIMachineFunctionInfo.h" 30 #include "Utils/AMDGPUBaseInfo.h" 31 #include "llvm/ADT/MapVector.h" 32 #include "llvm/ADT/PostOrderIterator.h" 33 #include "llvm/ADT/Sequence.h" 34 #include "llvm/CodeGen/MachineLoopInfo.h" 35 #include "llvm/CodeGen/MachinePostDominators.h" 36 #include "llvm/InitializePasses.h" 37 #include "llvm/Support/DebugCounter.h" 38 #include "llvm/TargetParser/TargetParser.h" 39 using namespace llvm; 40 41 #define DEBUG_TYPE "si-insert-waitcnts" 42 43 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp", 44 "Force emit s_waitcnt expcnt(0) instrs"); 45 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", 46 "Force emit s_waitcnt lgkmcnt(0) instrs"); 47 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", 48 "Force emit s_waitcnt vmcnt(0) instrs"); 49 50 static cl::opt<bool> ForceEmitZeroFlag( 51 "amdgpu-waitcnt-forcezero", 52 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), 53 cl::init(false), cl::Hidden); 54 55 namespace { 56 // Class of object that encapsulates latest instruction counter score 57 // associated with the operand. Used for determining whether 58 // s_waitcnt instruction needs to be emitted. 
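// For illustration, a generated wait such as "s_waitcnt vmcnt(0) lgkmcnt(0)"
// blocks until the corresponding hardware counters below have drained to
// zero, i.e. until all outstanding vector-memory and LDS/GDS/SMEM/message
// operations have completed.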
59 60 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; 61 } // namespace 62 63 namespace llvm { 64 template <> struct enum_iteration_traits<InstCounterType> { 65 static constexpr bool is_iterable = true; 66 }; 67 } // namespace llvm 68 69 namespace { 70 auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } 71 72 using RegInterval = std::pair<int, int>; 73 74 struct HardwareLimits { 75 unsigned VmcntMax; 76 unsigned ExpcntMax; 77 unsigned LgkmcntMax; 78 unsigned VscntMax; 79 }; 80 81 struct RegisterEncoding { 82 unsigned VGPR0; 83 unsigned VGPRL; 84 unsigned SGPR0; 85 unsigned SGPRL; 86 }; 87 88 enum WaitEventType { 89 VMEM_ACCESS, // vector-memory read & write 90 VMEM_READ_ACCESS, // vector-memory read 91 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch 92 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch 93 LDS_ACCESS, // lds read & write 94 GDS_ACCESS, // gds read & write 95 SQ_MESSAGE, // send message 96 SMEM_ACCESS, // scalar-memory read & write 97 EXP_GPR_LOCK, // export holding on its data src 98 GDS_GPR_LOCK, // GDS holding on its data and addr src 99 EXP_POS_ACCESS, // write to export position 100 EXP_PARAM_ACCESS, // write to export parameter 101 VMW_GPR_LOCK, // vector-memory write holding on its data src 102 EXP_LDS_ACCESS, // read by ldsdir counting as export 103 NUM_WAIT_EVENTS, 104 }; 105 106 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { 107 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), 108 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | 109 (1 << SQ_MESSAGE), 110 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | 111 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS), 112 (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)}; 113 114 // The mapping is: 115 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs 116 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots 117 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs 118 // We reserve a fixed number of VGPR slots in the scoring tables for 119 // special tokens like SCMEM_LDS (needed for buffer load to LDS). 120 enum RegisterMapping { 121 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. 122 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. 123 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. 124 NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. 125 EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes. 126 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. 127 }; 128 129 // Enumerate different types of result-returning VMEM operations. Although 130 // s_waitcnt orders them all with a single vmcnt counter, in the absence of 131 // s_waitcnt only instructions of the same VmemType are guaranteed to write 132 // their results in order -- so there is no need to insert an s_waitcnt between 133 // two instructions of the same type that write the same vgpr. 134 enum VmemType { 135 // BUF instructions and MIMG instructions without a sampler. 136 VMEM_NOSAMPLER, 137 // MIMG instructions with a sampler. 
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH
};

static bool updateVMCntOnly(const MachineInstr &Inst) {
  return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
         SIInstrInfo::isFLATScratch(Inst);
}

VmemType getVmemType(const MachineInstr &Inst) {
  assert(updateVMCntOnly(Inst));
  if (!SIInstrInfo::isMIMG(Inst))
    return VMEM_NOSAMPLER;
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
  return BaseInfo->BVH ? VMEM_BVH
         : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
}

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  case VS_CNT:
    Wait.VsCnt = std::min(Wait.VsCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}

// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change a
// counter, so that we know whether multiple types of events are in flight
// within the brackets. When multiple event types are pending, the counter may
// be decremented out of order, so a conservative "s_waitcnt 0" must be
// inserted before the use.
class WaitcntBrackets {
public:
  WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
                  RegisterEncoding Encoding)
      : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}

  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case VM_CNT:
      return Limits.VmcntMax;
    case LGKM_CNT:
      return Limits.LgkmcntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case VS_CNT:
      return Limits.VscntMax;
    default:
      break;
    }
    return 0;
  }

  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  // Mapping from event to counter.
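  // For example, SMEM_ACCESS and LDS_ACCESS map to LGKM_CNT, while
  // VMEM_READ_ACCESS maps to VM_CNT (see WaitEventMaskForInst above).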
223 InstCounterType eventCounter(WaitEventType E) const { 224 for (auto T : inst_counter_types()) { 225 if (WaitEventMaskForInst[T] & (1 << E)) 226 return T; 227 } 228 llvm_unreachable("event type has no associated counter"); 229 } 230 231 unsigned getRegScore(int GprNo, InstCounterType T) const { 232 if (GprNo < NUM_ALL_VGPRS) { 233 return VgprScores[T][GprNo]; 234 } 235 assert(T == LGKM_CNT); 236 return SgprScores[GprNo - NUM_ALL_VGPRS]; 237 } 238 239 bool merge(const WaitcntBrackets &Other); 240 241 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, 242 const MachineRegisterInfo *MRI, 243 const SIRegisterInfo *TRI, unsigned OpNo) const; 244 245 bool counterOutOfOrder(InstCounterType T) const; 246 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; 247 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; 248 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; 249 void applyWaitcnt(const AMDGPU::Waitcnt &Wait); 250 void applyWaitcnt(InstCounterType T, unsigned Count); 251 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, 252 const MachineRegisterInfo *MRI, WaitEventType E, 253 MachineInstr &MI); 254 255 unsigned hasPendingEvent() const { return PendingEvents; } 256 unsigned hasPendingEvent(WaitEventType E) const { 257 return PendingEvents & (1 << E); 258 } 259 unsigned hasPendingEvent(InstCounterType T) const { 260 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; 261 assert((HasPending != 0) == (getScoreRange(T) != 0)); 262 return HasPending; 263 } 264 265 bool hasMixedPendingEvents(InstCounterType T) const { 266 unsigned Events = hasPendingEvent(T); 267 // Return true if more than one bit is set in Events. 268 return Events & (Events - 1); 269 } 270 271 bool hasPendingFlat() const { 272 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && 273 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || 274 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && 275 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); 276 } 277 278 void setPendingFlat() { 279 LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; 280 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; 281 } 282 283 // Return true if there might be pending writes to the specified vgpr by VMEM 284 // instructions with types different from V. 
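  // For example, if both a sampling MIMG load (VMEM_SAMPLER) and a buffer
  // load (VMEM_NOSAMPLER) may still be writing GprNo, their results are not
  // guaranteed to return in order, so the caller must still wait.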
285 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { 286 assert(GprNo < NUM_ALL_VGPRS); 287 return VgprVmemTypes[GprNo] & ~(1 << V); 288 } 289 290 void clearVgprVmemTypes(int GprNo) { 291 assert(GprNo < NUM_ALL_VGPRS); 292 VgprVmemTypes[GprNo] = 0; 293 } 294 295 void setNonKernelFunctionInitialState() { 296 setScoreUB(VS_CNT, getWaitCountMax(VS_CNT)); 297 PendingEvents |= WaitEventMaskForInst[VS_CNT]; 298 } 299 300 void print(raw_ostream &); 301 void dump() { print(dbgs()); } 302 303 private: 304 struct MergeInfo { 305 unsigned OldLB; 306 unsigned OtherLB; 307 unsigned MyShift; 308 unsigned OtherShift; 309 }; 310 static bool mergeScore(const MergeInfo &M, unsigned &Score, 311 unsigned OtherScore); 312 313 void setScoreLB(InstCounterType T, unsigned Val) { 314 assert(T < NUM_INST_CNTS); 315 ScoreLBs[T] = Val; 316 } 317 318 void setScoreUB(InstCounterType T, unsigned Val) { 319 assert(T < NUM_INST_CNTS); 320 ScoreUBs[T] = Val; 321 322 if (T != EXP_CNT) 323 return; 324 325 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) 326 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); 327 } 328 329 void setRegScore(int GprNo, InstCounterType T, unsigned Val) { 330 if (GprNo < NUM_ALL_VGPRS) { 331 VgprUB = std::max(VgprUB, GprNo); 332 VgprScores[T][GprNo] = Val; 333 } else { 334 assert(T == LGKM_CNT); 335 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); 336 SgprScores[GprNo - NUM_ALL_VGPRS] = Val; 337 } 338 } 339 340 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, 341 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, 342 unsigned OpNo, unsigned Val); 343 344 const GCNSubtarget *ST = nullptr; 345 HardwareLimits Limits = {}; 346 RegisterEncoding Encoding = {}; 347 unsigned ScoreLBs[NUM_INST_CNTS] = {0}; 348 unsigned ScoreUBs[NUM_INST_CNTS] = {0}; 349 unsigned PendingEvents = 0; 350 // Remember the last flat memory operation. 351 unsigned LastFlat[NUM_INST_CNTS] = {0}; 352 // wait_cnt scores for every vgpr. 353 // Keep track of the VgprUB and SgprUB to make merge at join efficient. 354 int VgprUB = -1; 355 int SgprUB = -1; 356 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; 357 // Wait cnt scores for every sgpr, only lgkmcnt is relevant. 358 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; 359 // Bitmask of the VmemTypes of VMEM instructions that might have a pending 360 // write to each vgpr. 361 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; 362 }; 363 364 class SIInsertWaitcnts : public MachineFunctionPass { 365 private: 366 const GCNSubtarget *ST = nullptr; 367 const SIInstrInfo *TII = nullptr; 368 const SIRegisterInfo *TRI = nullptr; 369 const MachineRegisterInfo *MRI = nullptr; 370 AMDGPU::IsaVersion IV; 371 372 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; 373 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; 374 MachineLoopInfo *MLI; 375 MachinePostDominatorTree *PDT; 376 377 struct BlockInfo { 378 std::unique_ptr<WaitcntBrackets> Incoming; 379 bool Dirty = true; 380 }; 381 382 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; 383 384 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 385 // because of amdgpu-waitcnt-forcezero flag 386 bool ForceEmitZeroWaitcnts; 387 bool ForceEmitWaitcnt[NUM_INST_CNTS]; 388 389 bool OptNone; 390 391 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS 392 // message. 
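  // (Populated in generateWaitcntInstBefore on GFX11+ when VMEM stores may
  // still be outstanding and no scratch stores are pending.)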
393 DenseSet<MachineInstr *> ReleaseVGPRInsts; 394 395 public: 396 static char ID; 397 398 SIInsertWaitcnts() : MachineFunctionPass(ID) { 399 (void)ForceExpCounter; 400 (void)ForceLgkmCounter; 401 (void)ForceVMCounter; 402 } 403 404 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); 405 bool isPreheaderToFlush(MachineBasicBlock &MBB, 406 WaitcntBrackets &ScoreBrackets); 407 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; 408 bool runOnMachineFunction(MachineFunction &MF) override; 409 410 StringRef getPassName() const override { 411 return "SI insert wait instructions"; 412 } 413 414 void getAnalysisUsage(AnalysisUsage &AU) const override { 415 AU.setPreservesCFG(); 416 AU.addRequired<MachineLoopInfo>(); 417 AU.addRequired<MachinePostDominatorTree>(); 418 MachineFunctionPass::getAnalysisUsage(AU); 419 } 420 421 bool isForceEmitWaitcnt() const { 422 for (auto T : inst_counter_types()) 423 if (ForceEmitWaitcnt[T]) 424 return true; 425 return false; 426 } 427 428 void setForceEmitWaitcnt() { 429 // For non-debug builds, ForceEmitWaitcnt has been initialized to false; 430 // For debug builds, get the debug counter info and adjust if need be 431 #ifndef NDEBUG 432 if (DebugCounter::isCounterSet(ForceExpCounter) && 433 DebugCounter::shouldExecute(ForceExpCounter)) { 434 ForceEmitWaitcnt[EXP_CNT] = true; 435 } else { 436 ForceEmitWaitcnt[EXP_CNT] = false; 437 } 438 439 if (DebugCounter::isCounterSet(ForceLgkmCounter) && 440 DebugCounter::shouldExecute(ForceLgkmCounter)) { 441 ForceEmitWaitcnt[LGKM_CNT] = true; 442 } else { 443 ForceEmitWaitcnt[LGKM_CNT] = false; 444 } 445 446 if (DebugCounter::isCounterSet(ForceVMCounter) && 447 DebugCounter::shouldExecute(ForceVMCounter)) { 448 ForceEmitWaitcnt[VM_CNT] = true; 449 } else { 450 ForceEmitWaitcnt[VM_CNT] = false; 451 } 452 #endif // NDEBUG 453 } 454 455 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or 456 // FLAT instruction. 457 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { 458 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); 459 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side 460 // these should use VM_CNT. 461 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) 462 return VMEM_ACCESS; 463 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) { 464 // FLAT and SCRATCH instructions may access scratch. Other VMEM 465 // instructions do not. 
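      // A plain GLOBAL or BUFFER store therefore falls through to
      // VMEM_WRITE_ACCESS below and is tracked by vscnt rather than vmcnt.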
466 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) 467 return SCRATCH_WRITE_ACCESS; 468 return VMEM_WRITE_ACCESS; 469 } 470 return VMEM_READ_ACCESS; 471 } 472 473 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; 474 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; 475 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; 476 bool generateWaitcntInstBefore(MachineInstr &MI, 477 WaitcntBrackets &ScoreBrackets, 478 MachineInstr *OldWaitcntInstr, 479 bool FlushVmCnt); 480 bool generateWaitcntBlockEnd(MachineBasicBlock &Block, 481 WaitcntBrackets &ScoreBrackets, 482 MachineInstr *OldWaitcntInstr); 483 bool generateWaitcnt(AMDGPU::Waitcnt Wait, 484 MachineBasicBlock::instr_iterator It, 485 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, 486 MachineInstr *OldWaitcntInstr); 487 void updateEventWaitcntAfter(MachineInstr &Inst, 488 WaitcntBrackets *ScoreBrackets); 489 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, 490 WaitcntBrackets &ScoreBrackets); 491 bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, 492 MachineInstr &OldWaitcntInstr, 493 AMDGPU::Waitcnt &Wait, 494 MachineBasicBlock::instr_iterator It) const; 495 496 // Transform a soft waitcnt into a normal one. 497 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; 498 }; 499 500 } // end anonymous namespace 501 502 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, 503 const SIInstrInfo *TII, 504 const MachineRegisterInfo *MRI, 505 const SIRegisterInfo *TRI, 506 unsigned OpNo) const { 507 const MachineOperand &Op = MI->getOperand(OpNo); 508 if (!TRI->isInAllocatableClass(Op.getReg())) 509 return {-1, -1}; 510 511 // A use via a PW operand does not need a waitcnt. 512 // A partial write is not a WAW. 513 assert(!Op.getSubReg() || !Op.isUndef()); 514 515 RegInterval Result; 516 517 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & 518 AMDGPU::HWEncoding::REG_IDX_MASK; 519 520 if (TRI->isVectorRegister(*MRI, Op.getReg())) { 521 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); 522 Result.first = Reg - Encoding.VGPR0; 523 if (TRI->isAGPR(*MRI, Op.getReg())) 524 Result.first += AGPR_OFFSET; 525 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); 526 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { 527 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); 528 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; 529 assert(Result.first >= NUM_ALL_VGPRS && 530 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); 531 } 532 // TODO: Handle TTMP 533 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... 
534 else 535 return {-1, -1}; 536 537 const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo); 538 unsigned Size = TRI->getRegSizeInBits(*RC); 539 Result.second = Result.first + ((Size + 16) / 32); 540 541 return Result; 542 } 543 544 void WaitcntBrackets::setExpScore(const MachineInstr *MI, 545 const SIInstrInfo *TII, 546 const SIRegisterInfo *TRI, 547 const MachineRegisterInfo *MRI, unsigned OpNo, 548 unsigned Val) { 549 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo); 550 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); 551 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 552 setRegScore(RegNo, EXP_CNT, Val); 553 } 554 } 555 556 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, 557 const SIRegisterInfo *TRI, 558 const MachineRegisterInfo *MRI, 559 WaitEventType E, MachineInstr &Inst) { 560 InstCounterType T = eventCounter(E); 561 unsigned CurrScore = getScoreUB(T) + 1; 562 if (CurrScore == 0) 563 report_fatal_error("InsertWaitcnt score wraparound"); 564 // PendingEvents and ScoreUB need to be update regardless if this event 565 // changes the score of a register or not. 566 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. 567 PendingEvents |= 1 << E; 568 setScoreUB(T, CurrScore); 569 570 if (T == EXP_CNT) { 571 // Put score on the source vgprs. If this is a store, just use those 572 // specific register(s). 573 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { 574 int AddrOpIdx = 575 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr); 576 // All GDS operations must protect their address register (same as 577 // export.) 578 if (AddrOpIdx != -1) { 579 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); 580 } 581 582 if (Inst.mayStore()) { 583 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) { 584 setExpScore( 585 &Inst, TII, TRI, MRI, 586 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), 587 CurrScore); 588 } 589 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) { 590 setExpScore(&Inst, TII, TRI, MRI, 591 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), 592 AMDGPU::OpName::data1), 593 CurrScore); 594 } 595 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && 596 Inst.getOpcode() != AMDGPU::DS_APPEND && 597 Inst.getOpcode() != AMDGPU::DS_CONSUME && 598 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { 599 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { 600 const MachineOperand &Op = Inst.getOperand(I); 601 if (Op.isReg() && !Op.isDef() && 602 TRI->isVectorRegister(*MRI, Op.getReg())) { 603 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); 604 } 605 } 606 } 607 } else if (TII->isFLAT(Inst)) { 608 if (Inst.mayStore()) { 609 setExpScore( 610 &Inst, TII, TRI, MRI, 611 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 612 CurrScore); 613 } else if (SIInstrInfo::isAtomicRet(Inst)) { 614 setExpScore( 615 &Inst, TII, TRI, MRI, 616 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 617 CurrScore); 618 } 619 } else if (TII->isMIMG(Inst)) { 620 if (Inst.mayStore()) { 621 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 622 } else if (SIInstrInfo::isAtomicRet(Inst)) { 623 setExpScore( 624 &Inst, TII, TRI, MRI, 625 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 626 CurrScore); 627 } 628 } else if (TII->isMTBUF(Inst)) { 629 if (Inst.mayStore()) { 630 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); 631 } 632 } else if 
(TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setExpScore(
          &Inst, TII, TRI, MRI,
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
          CurrScore);
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(*MRI, DefMO.getReg())) {
            setRegScore(
                TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
                EXP_CNT, CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() &&
            TRI->isVectorRegister(*MRI, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
      Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
      Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
    for (int RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      auto &Op = Inst.getOperand(I);
      if (!Op.isReg() || !Op.isDef())
        continue;
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
      if (T == VM_CNT) {
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs.
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane index into `VgprVmemTypes` below.
          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
            VgprVmemTypes[RegNo] |= 1 << V;
        }
      }
      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (Inst.mayStore() &&
        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the
      // data written to LDS can be accessed. A load from LDS to VMEM does not
      // need a wait.
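      // The score is recorded on the artificial EXTRA_VGPR_LDS slot so that a
      // later access to the data written to LDS can find and wait on it.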
711 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); 712 } 713 } 714 } 715 716 void WaitcntBrackets::print(raw_ostream &OS) { 717 OS << '\n'; 718 for (auto T : inst_counter_types()) { 719 unsigned SR = getScoreRange(T); 720 721 switch (T) { 722 case VM_CNT: 723 OS << " VM_CNT(" << SR << "): "; 724 break; 725 case LGKM_CNT: 726 OS << " LGKM_CNT(" << SR << "): "; 727 break; 728 case EXP_CNT: 729 OS << " EXP_CNT(" << SR << "): "; 730 break; 731 case VS_CNT: 732 OS << " VS_CNT(" << SR << "): "; 733 break; 734 default: 735 OS << " UNKNOWN(" << SR << "): "; 736 break; 737 } 738 739 if (SR != 0) { 740 // Print vgpr scores. 741 unsigned LB = getScoreLB(T); 742 743 for (int J = 0; J <= VgprUB; J++) { 744 unsigned RegScore = getRegScore(J, T); 745 if (RegScore <= LB) 746 continue; 747 unsigned RelScore = RegScore - LB - 1; 748 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { 749 OS << RelScore << ":v" << J << " "; 750 } else { 751 OS << RelScore << ":ds "; 752 } 753 } 754 // Also need to print sgpr scores for lgkm_cnt. 755 if (T == LGKM_CNT) { 756 for (int J = 0; J <= SgprUB; J++) { 757 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); 758 if (RegScore <= LB) 759 continue; 760 unsigned RelScore = RegScore - LB - 1; 761 OS << RelScore << ":s" << J << " "; 762 } 763 } 764 } 765 OS << '\n'; 766 } 767 OS << '\n'; 768 } 769 770 /// Simplify the waitcnt, in the sense of removing redundant counts, and return 771 /// whether a waitcnt instruction is needed at all. 772 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { 773 simplifyWaitcnt(VM_CNT, Wait.VmCnt); 774 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); 775 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); 776 simplifyWaitcnt(VS_CNT, Wait.VsCnt); 777 } 778 779 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, 780 unsigned &Count) const { 781 // The number of outstanding events for this type, T, can be calculated 782 // as (UB - LB). If the current Count is greater than or equal to the number 783 // of outstanding events, then the wait for this counter is redundant. 784 if (Count >= getScoreRange(T)) 785 Count = ~0u; 786 } 787 788 void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, 789 AMDGPU::Waitcnt &Wait) const { 790 unsigned ScoreToWait = getRegScore(RegNo, T); 791 792 // If the score of src_operand falls within the bracket, we need an 793 // s_waitcnt instruction. 794 const unsigned LB = getScoreLB(T); 795 const unsigned UB = getScoreUB(T); 796 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { 797 if ((T == VM_CNT || T == LGKM_CNT) && 798 hasPendingFlat() && 799 !ST->hasFlatLgkmVMemCountInOrder()) { 800 // If there is a pending FLAT operation, and this is a VMem or LGKM 801 // waitcnt and the target can report early completion, then we need 802 // to force a waitcnt 0. 803 addWait(Wait, T, 0); 804 } else if (counterOutOfOrder(T)) { 805 // Counter can get decremented out-of-order when there 806 // are multiple types event in the bracket. Also emit an s_wait counter 807 // with a conservative value of 0 for the counter. 808 addWait(Wait, T, 0); 809 } else { 810 // If a counter has been maxed out avoid overflow by waiting for 811 // MAX(CounterType) - 1 instead. 
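      // For example, if the counter's maximum is 63 and UB - ScoreToWait is
      // 70, the emitted wait is clamped to 62.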
812 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); 813 addWait(Wait, T, NeededWait); 814 } 815 } 816 } 817 818 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { 819 applyWaitcnt(VM_CNT, Wait.VmCnt); 820 applyWaitcnt(EXP_CNT, Wait.ExpCnt); 821 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); 822 applyWaitcnt(VS_CNT, Wait.VsCnt); 823 } 824 825 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { 826 const unsigned UB = getScoreUB(T); 827 if (Count >= UB) 828 return; 829 if (Count != 0) { 830 if (counterOutOfOrder(T)) 831 return; 832 setScoreLB(T, std::max(getScoreLB(T), UB - Count)); 833 } else { 834 setScoreLB(T, UB); 835 PendingEvents &= ~WaitEventMaskForInst[T]; 836 } 837 } 838 839 // Where there are multiple types of event in the bracket of a counter, 840 // the decrement may go out of order. 841 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { 842 // Scalar memory read always can go out of order. 843 if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) 844 return true; 845 return hasMixedPendingEvents(T); 846 } 847 848 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 849 false) 850 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 851 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) 852 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, 853 false) 854 855 char SIInsertWaitcnts::ID = 0; 856 857 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; 858 859 FunctionPass *llvm::createSIInsertWaitcntsPass() { 860 return new SIInsertWaitcnts(); 861 } 862 863 static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, 864 unsigned NewEnc) { 865 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); 866 assert(OpIdx >= 0); 867 868 MachineOperand &MO = MI.getOperand(OpIdx); 869 870 if (NewEnc == MO.getImm()) 871 return false; 872 873 MO.setImm(NewEnc); 874 return true; 875 } 876 877 bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { 878 unsigned Opcode = Waitcnt->getOpcode(); 879 if (!SIInstrInfo::isSoftWaitcnt(Opcode)) 880 return false; 881 882 Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode))); 883 return true; 884 } 885 886 /// Combine consecutive waitcnt instructions that precede \p It and follow 887 /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added 888 /// by previous passes. Currently this pass conservatively assumes that these 889 /// preexisting waitcnt are required for correctness. 890 bool SIInsertWaitcnts::applyPreexistingWaitcnt( 891 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, 892 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { 893 bool Modified = false; 894 MachineInstr *WaitcntInstr = nullptr; 895 MachineInstr *WaitcntVsCntInstr = nullptr; 896 897 for (auto &II : 898 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { 899 if (II.isMetaInstruction()) 900 continue; 901 902 unsigned Opcode = II.getOpcode(); 903 bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode); 904 905 if (SIInstrInfo::isWaitcnt(Opcode)) { 906 // Update required wait count. If this is a soft waitcnt (= it was added 907 // by an earlier pass), it may be entirely removed. 908 unsigned IEnc = II.getOperand(0).getImm(); 909 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); 910 if (IsSoft) 911 ScoreBrackets.simplifyWaitcnt(OldWait); 912 Wait = Wait.combined(OldWait); 913 914 // Merge consecutive waitcnt of the same type by erasing multiples. 
      if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;

    } else {
      assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
      Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  // Update the encoding of the merged waitcnt with the required wait.
  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(Wait);
    Wait.VmCnt = ~0u;
    Wait.LgkmCnt = ~0u;
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                   ? dbgs()
                         << "applyPreexistingWaitcnt\n"
                         << "New Instr at block end: " << *WaitcntInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.VsCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
    ScoreBrackets.applyWaitcnt(Wait);
    Wait.VsCnt = ~0u;

    LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                   ? dbgs() << "applyPreexistingWaitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
  // Currently all conventions wait, but this may not always be the case.
  //
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
  return true;
}

/// \returns true if the callee is expected to wait on any outstanding counters
/// before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
  return true;
}

/// Generate the s_waitcnt instruction, if any, to be placed before \p MI.
/// Instructions of a given type are returned in order, but instructions of
/// different types can complete out of order. We rely on this in-order
/// completion and simply assign a score to each memory access instruction.
/// We keep track of the active "score bracket" to determine whether an access
/// to a memory read result requires an s_waitcnt, and if so what value each
/// counter needs. The "score bracket" is bound by the lower-bound and
/// upper-bound scores (ScoreLBs and ScoreUBs respectively).
/// If FlushVmCnt is true, we also want to generate an s_waitcnt that flushes
/// the vmcnt counter here.
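/// For illustration only: if \p MI reads a vgpr with an outstanding VMEM load
/// and N younger VMEM operations have been issued since that load, this
/// routine would typically request vmcnt(N), allowing those N younger
/// operations to stay in flight while guaranteeing the load has completed.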
1011 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, 1012 WaitcntBrackets &ScoreBrackets, 1013 MachineInstr *OldWaitcntInstr, 1014 bool FlushVmCnt) { 1015 setForceEmitWaitcnt(); 1016 1017 if (MI.isMetaInstruction()) 1018 return false; 1019 1020 AMDGPU::Waitcnt Wait; 1021 1022 // FIXME: This should have already been handled by the memory legalizer. 1023 // Removing this currently doesn't affect any lit tests, but we need to 1024 // verify that nothing was relying on this. The number of buffer invalidates 1025 // being handled here should not be expanded. 1026 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || 1027 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || 1028 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || 1029 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || 1030 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { 1031 Wait.VmCnt = 0; 1032 } 1033 1034 // All waits must be resolved at call return. 1035 // NOTE: this could be improved with knowledge of all call sites or 1036 // with knowledge of the called routines. 1037 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || 1038 MI.getOpcode() == AMDGPU::SI_RETURN || 1039 MI.getOpcode() == AMDGPU::S_SETPC_B64_return || 1040 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { 1041 Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt()); 1042 } 1043 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM 1044 // stores. In this case it can be useful to send a message to explicitly 1045 // release all VGPRs before the stores have completed, but it is only safe to 1046 // do this if there are no outstanding scratch stores. 1047 else if (MI.getOpcode() == AMDGPU::S_ENDPGM || 1048 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { 1049 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone && 1050 ScoreBrackets.getScoreRange(VS_CNT) != 0 && 1051 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) 1052 ReleaseVGPRInsts.insert(&MI); 1053 } 1054 // Resolve vm waits before gs-done. 1055 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || 1056 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && 1057 ST->hasLegacyGeometry() && 1058 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == 1059 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { 1060 Wait.VmCnt = 0; 1061 } 1062 #if 0 // TODO: the following blocks of logic when we have fence. 
1063 else if (MI.getOpcode() == SC_FENCE) { 1064 const unsigned int group_size = 1065 context->shader_info->GetMaxThreadGroupSize(); 1066 // group_size == 0 means thread group size is unknown at compile time 1067 const bool group_is_multi_wave = 1068 (group_size == 0 || group_size > target_info->GetWaveFrontSize()); 1069 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence(); 1070 1071 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) { 1072 SCRegType src_type = Inst->GetSrcType(i); 1073 switch (src_type) { 1074 case SCMEM_LDS: 1075 if (group_is_multi_wave || 1076 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { 1077 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, 1078 ScoreBrackets->getScoreUB(LGKM_CNT)); 1079 // LDS may have to wait for VM_CNT after buffer load to LDS 1080 if (target_info->HasBufferLoadToLDS()) { 1081 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, 1082 ScoreBrackets->getScoreUB(VM_CNT)); 1083 } 1084 } 1085 break; 1086 1087 case SCMEM_GDS: 1088 if (group_is_multi_wave || fence_is_global) { 1089 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 1090 ScoreBrackets->getScoreUB(EXP_CNT)); 1091 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, 1092 ScoreBrackets->getScoreUB(LGKM_CNT)); 1093 } 1094 break; 1095 1096 case SCMEM_UAV: 1097 case SCMEM_TFBUF: 1098 case SCMEM_RING: 1099 case SCMEM_SCATTER: 1100 if (group_is_multi_wave || fence_is_global) { 1101 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, 1102 ScoreBrackets->getScoreUB(EXP_CNT)); 1103 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, 1104 ScoreBrackets->getScoreUB(VM_CNT)); 1105 } 1106 break; 1107 1108 case SCMEM_SCRATCH: 1109 default: 1110 break; 1111 } 1112 } 1113 } 1114 #endif 1115 1116 // Export & GDS instructions do not read the EXEC mask until after the export 1117 // is granted (which can occur well after the instruction is issued). 1118 // The shader program must flush all EXP operations on the export-count 1119 // before overwriting the EXEC mask. 1120 else { 1121 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { 1122 // Export and GDS are tracked individually, either may trigger a waitcnt 1123 // for EXEC. 1124 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) || 1125 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) || 1126 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) || 1127 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) { 1128 Wait.ExpCnt = 0; 1129 } 1130 } 1131 1132 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { 1133 // The function is going to insert a wait on everything in its prolog. 1134 // This still needs to be careful if the call target is a load (e.g. a GOT 1135 // load). We also need to check WAW dependency with saved PC. 
1136 Wait = AMDGPU::Waitcnt(); 1137 1138 int CallAddrOpIdx = 1139 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 1140 1141 if (MI.getOperand(CallAddrOpIdx).isReg()) { 1142 RegInterval CallAddrOpInterval = 1143 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx); 1144 1145 for (int RegNo = CallAddrOpInterval.first; 1146 RegNo < CallAddrOpInterval.second; ++RegNo) 1147 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); 1148 1149 int RtnAddrOpIdx = 1150 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); 1151 if (RtnAddrOpIdx != -1) { 1152 RegInterval RtnAddrOpInterval = 1153 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx); 1154 1155 for (int RegNo = RtnAddrOpInterval.first; 1156 RegNo < RtnAddrOpInterval.second; ++RegNo) 1157 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); 1158 } 1159 } 1160 } else { 1161 // FIXME: Should not be relying on memoperands. 1162 // Look at the source operands of every instruction to see if 1163 // any of them results from a previous memory operation that affects 1164 // its current usage. If so, an s_waitcnt instruction needs to be 1165 // emitted. 1166 // If the source operand was defined by a load, add the s_waitcnt 1167 // instruction. 1168 // 1169 // Two cases are handled for destination operands: 1170 // 1) If the destination operand was defined by a load, add the s_waitcnt 1171 // instruction to guarantee the right WAW order. 1172 // 2) If a destination operand that was used by a recent export/store ins, 1173 // add s_waitcnt on exp_cnt to guarantee the WAR order. 1174 for (const MachineMemOperand *Memop : MI.memoperands()) { 1175 const Value *Ptr = Memop->getValue(); 1176 if (Memop->isStore() && SLoadAddresses.count(Ptr)) { 1177 addWait(Wait, LGKM_CNT, 0); 1178 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) 1179 SLoadAddresses.erase(Ptr); 1180 } 1181 unsigned AS = Memop->getAddrSpace(); 1182 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS) 1183 continue; 1184 // No need to wait before load from VMEM to LDS. 1185 if (TII->mayWriteLDSThroughDMA(MI)) 1186 continue; 1187 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; 1188 // VM_CNT is only relevant to vgpr or LDS. 1189 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); 1190 if (Memop->isStore()) { 1191 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); 1192 } 1193 } 1194 1195 // Loop over use and def operands. 1196 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 1197 MachineOperand &Op = MI.getOperand(I); 1198 if (!Op.isReg()) 1199 continue; 1200 1201 // If the instruction does not read tied source, skip the operand. 1202 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) 1203 continue; 1204 1205 RegInterval Interval = 1206 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); 1207 1208 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); 1209 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { 1210 if (IsVGPR) { 1211 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the 1212 // previous write and this write are the same type of VMEM 1213 // instruction, in which case they're guaranteed to write their 1214 // results in order anyway. 
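          // (The hasOtherPendingVmemTypes check below implements that
          // same-type exception.)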
1215 if (Op.isUse() || !updateVMCntOnly(MI) || 1216 ScoreBrackets.hasOtherPendingVmemTypes(RegNo, 1217 getVmemType(MI))) { 1218 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); 1219 ScoreBrackets.clearVgprVmemTypes(RegNo); 1220 } 1221 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { 1222 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); 1223 } 1224 } 1225 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); 1226 } 1227 } 1228 } 1229 } 1230 1231 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does 1232 // not, we need to ensure the subtarget is capable of backing off barrier 1233 // instructions in case there are any outstanding memory operations that may 1234 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 1235 if (MI.getOpcode() == AMDGPU::S_BARRIER && 1236 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { 1237 Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); 1238 } 1239 1240 // TODO: Remove this work-around, enable the assert for Bug 457939 1241 // after fixing the scheduler. Also, the Shader Compiler code is 1242 // independent of target. 1243 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { 1244 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { 1245 Wait.LgkmCnt = 0; 1246 } 1247 } 1248 1249 // Verify that the wait is actually needed. 1250 ScoreBrackets.simplifyWaitcnt(Wait); 1251 1252 if (ForceEmitZeroWaitcnts) 1253 Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(); 1254 1255 if (ForceEmitWaitcnt[VM_CNT]) 1256 Wait.VmCnt = 0; 1257 if (ForceEmitWaitcnt[EXP_CNT]) 1258 Wait.ExpCnt = 0; 1259 if (ForceEmitWaitcnt[LGKM_CNT]) 1260 Wait.LgkmCnt = 0; 1261 1262 if (FlushVmCnt) { 1263 if (ScoreBrackets.hasPendingEvent(VM_CNT)) 1264 Wait.VmCnt = 0; 1265 } 1266 1267 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, 1268 OldWaitcntInstr); 1269 } 1270 1271 // Add a waitcnt to flush the vmcnt counter at the end of the given block if 1272 // needed. 1273 bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, 1274 WaitcntBrackets &ScoreBrackets, 1275 MachineInstr *OldWaitcntInstr) { 1276 AMDGPU::Waitcnt Wait; 1277 1278 if (!ScoreBrackets.hasPendingEvent(VM_CNT)) 1279 return false; 1280 1281 Wait.VmCnt = 0; 1282 1283 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, 1284 OldWaitcntInstr); 1285 } 1286 1287 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, 1288 MachineBasicBlock::instr_iterator It, 1289 MachineBasicBlock &Block, 1290 WaitcntBrackets &ScoreBrackets, 1291 MachineInstr *OldWaitcntInstr) { 1292 bool Modified = false; 1293 const DebugLoc &DL = Block.findDebugLoc(It); 1294 1295 if (OldWaitcntInstr) 1296 // Try to merge the required wait with preexisting waitcnt instructions. 1297 // Also erase redundant waitcnt. 1298 Modified = 1299 applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); 1300 else 1301 ScoreBrackets.applyWaitcnt(Wait); 1302 1303 // ExpCnt can be merged into VINTERP. 
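  // VINTERP carries its own waitexp operand, so the required export wait can
  // be folded into the instruction instead of emitting a separate s_waitcnt.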
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(*It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
                      << "Update Instr: " << *It);
  }

  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
  // instruction was modified to handle the required wait.
  if (Wait.hasWaitExceptVsCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitVsCnt()) {
    assert(ST->hasVscnt());

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.VsCnt);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }
  return Modified;
}

// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // All flat instructions use the VMEM counter.
  assert(TII->usesVM_CNT(MI));

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    assert(AS != AMDGPUAS::REGION_ADDRESS);
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}

// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!TII->usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST->isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}

// This is a flat memory operation. Check to see if it has memory tokens for
// either scratch or FLAT.
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
    const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // SCRATCH instructions always access scratch.
  if (TII->isFLATScratch(MI))
    return true;

  // GLOBAL instructions never access scratch.
  if (TII->isFLATGlobal(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access scratch.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves scratch.
  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
    unsigned AS = Memop->getAddrSpace();
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  });
}

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoadOrStore());

    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM be flushed to zero if it
    // is pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                 Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    case AMDGPU::S_BARRIER_LEAVE:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    }
  }
}

bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}

/// Merge the pending events and associated score brackets of \p Other into
/// this bracket's status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
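/// For example, if only one predecessor of a join has a pending SMEM_ACCESS
/// event, the merged state keeps that event pending so that code after the
/// join still waits on lgkmcnt before using the loaded value.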
1534 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { 1535 bool StrictDom = false; 1536 1537 VgprUB = std::max(VgprUB, Other.VgprUB); 1538 SgprUB = std::max(SgprUB, Other.SgprUB); 1539 1540 for (auto T : inst_counter_types()) { 1541 // Merge event flags for this counter 1542 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; 1543 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; 1544 if (OtherEvents & ~OldEvents) 1545 StrictDom = true; 1546 PendingEvents |= OtherEvents; 1547 1548 // Merge scores for this counter 1549 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; 1550 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; 1551 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending); 1552 if (NewUB < ScoreLBs[T]) 1553 report_fatal_error("waitcnt score overflow"); 1554 1555 MergeInfo M; 1556 M.OldLB = ScoreLBs[T]; 1557 M.OtherLB = Other.ScoreLBs[T]; 1558 M.MyShift = NewUB - ScoreUBs[T]; 1559 M.OtherShift = NewUB - Other.ScoreUBs[T]; 1560 1561 ScoreUBs[T] = NewUB; 1562 1563 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); 1564 1565 for (int J = 0; J <= VgprUB; J++) 1566 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); 1567 1568 if (T == LGKM_CNT) { 1569 for (int J = 0; J <= SgprUB; J++) 1570 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); 1571 } 1572 } 1573 1574 for (int J = 0; J <= VgprUB; J++) { 1575 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; 1576 StrictDom |= NewVmemTypes != VgprVmemTypes[J]; 1577 VgprVmemTypes[J] = NewVmemTypes; 1578 } 1579 1580 return StrictDom; 1581 } 1582 1583 static bool isWaitInstr(MachineInstr &Inst) { 1584 auto Opcode = Inst.getOpcode(); 1585 return SIInstrInfo::isWaitcnt(Opcode) || 1586 (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() && 1587 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL); 1588 } 1589 1590 // Generate s_waitcnt instructions where needed. 1591 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, 1592 MachineBasicBlock &Block, 1593 WaitcntBrackets &ScoreBrackets) { 1594 bool Modified = false; 1595 1596 LLVM_DEBUG({ 1597 dbgs() << "*** Block" << Block.getNumber() << " ***"; 1598 ScoreBrackets.dump(); 1599 }); 1600 1601 // Track the correctness of vccz through this basic block. There are two 1602 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and 1603 // ST->partialVCCWritesUpdateVCCZ(). 1604 bool VCCZCorrect = true; 1605 if (ST->hasReadVCCZBug()) { 1606 // vccz could be incorrect at a basic block boundary if a predecessor wrote 1607 // to vcc and then issued an smem load. 1608 VCCZCorrect = false; 1609 } else if (!ST->partialVCCWritesUpdateVCCZ()) { 1610 // vccz could be incorrect at a basic block boundary if a predecessor wrote 1611 // to vcc_lo or vcc_hi. 1612 VCCZCorrect = false; 1613 } 1614 1615 // Walk over the instructions. 1616 MachineInstr *OldWaitcntInstr = nullptr; 1617 1618 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), 1619 E = Block.instr_end(); 1620 Iter != E;) { 1621 MachineInstr &Inst = *Iter; 1622 1623 // Track pre-existing waitcnts that were added in earlier iterations or by 1624 // the memory legalizer. 
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
          Inst.definesRegister(AMDGPU::VCC_HI)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC)) {
        // There is a hardware bug on CI/SI where an SMRD instruction may
        // corrupt the vccz bit, so when we detect that an instruction may read
        // from a corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while there's an outstanding smem read may get
          // clobbered as soon as any read completes.
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (RestoreVCCZ) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ?
                           AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets))
    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);

  return Modified;
}

// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) {
  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
  if (!IsInserted)
    return Iterator->second;

  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
  if (!Succ)
    return false;

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return false;

  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets)) {
    Iterator->second = true;
    return true;
  }

  return false;
}

bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
  return SIInstrInfo::isVMEM(MI) ||
         (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
}

// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s) but no vmem load, and at least one use
//    of a vgpr containing a value that is loaded outside of the loop. (Only
//    on targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in
//    the loop, and at least one use of a vgpr containing a value that is
//    loaded outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
        // Vgpr use
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(RegNo))
              return false;
            VgprUse.insert(RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(RegNo, VM_CNT) >
                Brackets.getScoreLB(VM_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        }
        // VMem load vgpr def
        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
      }
    }
  }
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  return HasVMemLoad && UsesVgprLoadedOutside;
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MLI = &getAnalysis<MachineLoopInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  OptNone = MF.getFunction().hasOptNone() ||
            MF.getTarget().getOptLevel() == CodeGenOptLevel::None;

  HardwareLimits Limits = {};
  Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
  Limits.VscntMax = ST->hasVscnt() ? 63 : 0;

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding Encoding = {};
  Encoding.VGPR0 =
      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 =
      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  BlockInfos.clear();
  bool Modified = false;

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    MachineBasicBlock::iterator I = EntryBB.begin();
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;
    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);

    auto NonKernelInitialState =
        std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
    NonKernelInitialState->setNonKernelFunctionInitialState();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fixed point is reached.
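  // A single reverse-post-order sweep is not always enough: when a loop
  // back-edge merges new bracket state into an already-visited block, that
  // block is marked Dirty again and the sweep repeats (see the Repeat flag
  // below) until no block's incoming state changes.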
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert({MBB, BlockInfo()});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
        else
          *Brackets = WaitcntBrackets(ST, Limits, Encoding);
      }

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
  // instructions.
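  // On targets that support it (gfx11+), this lets the wave release its VGPRs
  // while it drains so that new waves can launch sooner. The emitted sequence
  // is roughly:
  //   s_nop 0                               ; only where the hardware needs it
  //   s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  //   s_endpgm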
  for (MachineInstr *MI : ReleaseVGPRInsts) {
    if (ST->requiresNopBeforeDeallocVGPRs()) {
      BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
    }
    BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
    Modified = true;
  }
  ReleaseVGPRInsts.clear();

  return Modified;
}