xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
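///
/// An illustrative sketch of that situation (an assumed instruction
/// sequence, not taken from real compiler output):
///
///   ds_read_b32  v0, ...   ; LDS op #1 -- result needed below
///   ds_read_b32  v1, ...   ; LDS op #2
///   s_load_dword s0, ...   ; SMEM op, counted on the same lgkmcnt
///   s_waitcnt lgkmcnt(0)   ; emitted today because the event types are mixed
///
/// Since LDS operations retire in order among themselves, lgkmcnt(1) would
/// already guarantee that LDS op #1 has completed.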
23 //
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/ADT/Sequence.h"
34 #include "llvm/Analysis/AliasAnalysis.h"
35 #include "llvm/CodeGen/MachineLoopInfo.h"
36 #include "llvm/CodeGen/MachinePassManager.h"
37 #include "llvm/CodeGen/MachinePostDominators.h"
38 #include "llvm/Support/DebugCounter.h"
39 #include "llvm/TargetParser/TargetParser.h"
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "si-insert-waitcnts"
43 
44 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
45               "Force emit s_waitcnt expcnt(0) instrs");
46 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
47               "Force emit s_waitcnt lgkmcnt(0) instrs");
48 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
49               "Force emit s_waitcnt vmcnt(0) instrs");
50 
51 static cl::opt<bool>
52     ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
53                       cl::desc("Force all waitcnt instrs to be emitted as "
54                                "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
55                       cl::init(false), cl::Hidden);
56 
57 static cl::opt<bool> ForceEmitZeroLoadFlag(
58     "amdgpu-waitcnt-load-forcezero",
59     cl::desc("Force all waitcnt load counters to wait until 0"),
60     cl::init(false), cl::Hidden);
61 
62 namespace {
63 // Encapsulates the latest instruction counter score associated with an
64 // operand. Used to determine whether an s_waitcnt instruction needs to be
65 // emitted.
66 
67 enum InstCounterType {
68   LOAD_CNT = 0, // VMcnt prior to gfx12.
69   DS_CNT,       // LGKMcnt prior to gfx12.
70   EXP_CNT,      //
71   STORE_CNT,    // VScnt in gfx10/gfx11.
72   NUM_NORMAL_INST_CNTS,
73   SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
74   BVH_CNT,                           // gfx12+ only.
75   KM_CNT,                            // gfx12+ only.
76   X_CNT,                             // gfx1250.
77   NUM_EXTENDED_INST_CNTS,
78   NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
79 };
80 } // namespace
81 
82 namespace llvm {
83 template <> struct enum_iteration_traits<InstCounterType> {
84   static constexpr bool is_iterable = true;
85 };
86 } // namespace llvm
87 
88 namespace {
89 // Return an iterator over all counters between LOAD_CNT (the first counter)
90 // and \c MaxCounter (exclusive, default value yields an enumeration over
91 // all counters).
92 auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
93   return enum_seq(LOAD_CNT, MaxCounter);
94 }
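// Example usage (as elsewhere in this file): `inst_counter_types()` walks
// every counter, while `inst_counter_types(NUM_NORMAL_INST_CNTS)` stops
// after the four pre-gfx12 ("normal") counters.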
95 
96 using RegInterval = std::pair<int, int>;
97 
98 struct HardwareLimits {
99   unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
100   unsigned ExpcntMax;
101   unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
102   unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
103   unsigned SamplecntMax; // gfx12+ only.
104   unsigned BvhcntMax;    // gfx12+ only.
105   unsigned KmcntMax;     // gfx12+ only.
106   unsigned XcntMax;      // gfx1250.
107 };
108 
109 #define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
110   DECL(VMEM_ACCESS)              /* vmem read & write */                       \
111   DECL(VMEM_READ_ACCESS)         /* vmem read */                               \
112   DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
113   DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
114   DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
115   DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
116   DECL(VMEM_GROUP)               /* vmem group */                              \
117   DECL(LDS_ACCESS)               /* lds read & write */                        \
118   DECL(GDS_ACCESS)               /* gds read & write */                        \
119   DECL(SQ_MESSAGE)               /* send message */                            \
120   DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
121   DECL(SMEM_GROUP)               /* scalar-memory group */                     \
122   DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
123   DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
124   DECL(EXP_POS_ACCESS)           /* write to export position */                \
125   DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
126   DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
127   DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */
128 
129 // clang-format off
130 #define AMDGPU_EVENT_ENUM(Name) Name,
131 enum WaitEventType {
132   AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
133   NUM_WAIT_EVENTS
134 };
135 #undef AMDGPU_EVENT_ENUM
136 
137 #define AMDGPU_EVENT_NAME(Name) #Name,
138 static constexpr StringLiteral WaitEventTypeName[] = {
139   AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
140 };
141 #undef AMDGPU_EVENT_NAME
142 // clang-format on
143 
144 // The mapping is:
145 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
146 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
147 //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
148 // We reserve a fixed number of VGPR slots in the scoring tables for
149 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
150 enum RegisterMapping {
151   SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
152   AGPR_OFFSET = 512,       // Maximum programmable ArchVGPRs across all targets.
153   SQ_MAX_PGM_SGPRS = 128,  // Maximum programmable SGPRs across all targets.
154   // Artificial register slots to track LDS writes into specific LDS locations
155   // if a location is known. When slots are exhausted or the location is
156   // unknown, use the first slot. The first slot is also always updated, in
157   // addition to the known location's slot, to properly generate waits if a
158   // dependent instruction's location is unknown.
159   FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
160   NUM_LDS_VGPRS = 9,                 // One more than the stores we track.
161   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
162 };
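// A worked example of this mapping, following getRegInterval below
// (illustrative): the low 16 bits of ArchVGPR v5 land in slot 5*2 = 10 and
// the high bits in slot 11; AGPR a5 starts at 10 + AGPR_OFFSET = 522; SGPR
// s3 maps to NUM_ALL_VGPRS + 3 = 1036; and LDS DMA tracking begins at slot
// FIRST_LDS_VGPR = 1024.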
163 
164 // Enumerate different types of result-returning VMEM operations. Although
165 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
166 // s_waitcnt only instructions of the same VmemType are guaranteed to write
167 // their results in order -- so there is no need to insert an s_waitcnt between
168 // two instructions of the same type that write the same vgpr.
169 enum VmemType {
170   // BUF instructions and MIMG instructions without a sampler.
171   VMEM_NOSAMPLER,
172   // MIMG instructions with a sampler.
173   VMEM_SAMPLER,
174   // BVH instructions
175   VMEM_BVH,
176   NUM_VMEM_TYPES
177 };
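// Practical consequence of the in-order guarantee above (illustrative): two
// sampler loads that write the same vgpr need no s_waitcnt between them, but
// a VMEM_NOSAMPLER load followed by a VMEM_SAMPLER load writing the same
// vgpr does, because their results may be written out of order.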
178 
179 // Maps values of InstCounterType to the instruction that waits on that
180 // counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
181 // returns true.
182 static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
183     AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
184     AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
185     AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};
186 
187 static bool updateVMCntOnly(const MachineInstr &Inst) {
188   return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
189          SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
190 }
191 
192 #ifndef NDEBUG
193 static bool isNormalMode(InstCounterType MaxCounter) {
194   return MaxCounter == NUM_NORMAL_INST_CNTS;
195 }
196 #endif // NDEBUG
197 
198 VmemType getVmemType(const MachineInstr &Inst) {
199   assert(updateVMCntOnly(Inst));
200   if (!SIInstrInfo::isImage(Inst))
201     return VMEM_NOSAMPLER;
202   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
203   const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
204       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
205 
206   if (BaseInfo->BVH)
207     return VMEM_BVH;
208 
209   // We have to make an additional check for isVSAMPLE here since some
210   // instructions don't have a sampler, but are still classified as sampler
211   // instructions for the purposes of e.g. waitcnt.
212   if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
213     return VMEM_SAMPLER;
214 
215   return VMEM_NOSAMPLER;
216 }
217 
218 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
219   switch (T) {
220   case LOAD_CNT:
221     return Wait.LoadCnt;
222   case EXP_CNT:
223     return Wait.ExpCnt;
224   case DS_CNT:
225     return Wait.DsCnt;
226   case STORE_CNT:
227     return Wait.StoreCnt;
228   case SAMPLE_CNT:
229     return Wait.SampleCnt;
230   case BVH_CNT:
231     return Wait.BvhCnt;
232   case KM_CNT:
233     return Wait.KmCnt;
234   case X_CNT:
235     return Wait.XCnt;
236   default:
237     llvm_unreachable("bad InstCounterType");
238   }
239 }
240 
241 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
242   unsigned &WC = getCounterRef(Wait, T);
243   WC = std::min(WC, Count);
244 }
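// Note: taking the minimum makes combined waits strictly stronger, e.g.
// addWait(Wait, LOAD_CNT, 2) followed by addWait(Wait, LOAD_CNT, 5) leaves
// Wait.LoadCnt == 2, which subsumes the weaker loadcnt(5) requirement.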
245 
246 void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
247   getCounterRef(Wait, T) = ~0u;
248 }
249 
250 unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
251   return getCounterRef(Wait, T);
252 }
253 
254 // Mapping from event to counter according to the table masks.
255 InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
256   for (auto T : inst_counter_types()) {
257     if (masks[T] & (1 << E))
258       return T;
259   }
260   llvm_unreachable("event type has no associated counter");
261 }
262 
263 // This object maintains the current score brackets of each wait counter, and
264 // a per-register scoreboard for each wait counter.
265 //
266 // We also maintain the latest score for every event type that can change the
267 // waitcnt, in order to know if there are multiple types of events within
268 // the brackets. When multiple event types happen within a bracket, the wait
269 // count may be decremented out of order, so we need to insert an
270 // "s_waitcnt 0" before use.
271 class WaitcntBrackets {
272 public:
273   WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
274                   HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
275                   InstCounterType SmemAccessCounter)
276       : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
277         WaitEventMaskForInst(WaitEventMaskForInst),
278         SmemAccessCounter(SmemAccessCounter) {}
279 
280   unsigned getWaitCountMax(InstCounterType T) const {
281     switch (T) {
282     case LOAD_CNT:
283       return Limits.LoadcntMax;
284     case DS_CNT:
285       return Limits.DscntMax;
286     case EXP_CNT:
287       return Limits.ExpcntMax;
288     case STORE_CNT:
289       return Limits.StorecntMax;
290     case SAMPLE_CNT:
291       return Limits.SamplecntMax;
292     case BVH_CNT:
293       return Limits.BvhcntMax;
294     case KM_CNT:
295       return Limits.KmcntMax;
296     case X_CNT:
297       return Limits.XcntMax;
298     default:
299       break;
300     }
301     return 0;
302   }
303 
304   bool isSmemCounter(InstCounterType T) const {
305     return T == SmemAccessCounter || T == X_CNT;
306   }
307 
308   unsigned getSgprScoresIdx(InstCounterType T) const {
309     assert(isSmemCounter(T) && "Invalid SMEM counter");
310     return T == X_CNT ? 1 : 0;
311   }
312 
313   unsigned getScoreLB(InstCounterType T) const {
314     assert(T < NUM_INST_CNTS);
315     return ScoreLBs[T];
316   }
317 
318   unsigned getScoreUB(InstCounterType T) const {
319     assert(T < NUM_INST_CNTS);
320     return ScoreUBs[T];
321   }
322 
323   unsigned getScoreRange(InstCounterType T) const {
324     return getScoreUB(T) - getScoreLB(T);
325   }
326 
327   unsigned getRegScore(int GprNo, InstCounterType T) const {
328     if (GprNo < NUM_ALL_VGPRS)
329       return VgprScores[T][GprNo];
330     return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
331   }
332 
333   bool merge(const WaitcntBrackets &Other);
334 
335   RegInterval getRegInterval(const MachineInstr *MI,
336                              const MachineRegisterInfo *MRI,
337                              const SIRegisterInfo *TRI,
338                              const MachineOperand &Op) const;
339 
340   bool counterOutOfOrder(InstCounterType T) const;
341   void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
342   void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
343 
344   void determineWait(InstCounterType T, RegInterval Interval,
345                      AMDGPU::Waitcnt &Wait) const;
346   void determineWait(InstCounterType T, int RegNo,
347                      AMDGPU::Waitcnt &Wait) const {
348     determineWait(T, {RegNo, RegNo + 1}, Wait);
349   }
350 
351   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
352   void applyWaitcnt(InstCounterType T, unsigned Count);
353   void applyXcnt(const AMDGPU::Waitcnt &Wait);
354   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
355                      const MachineRegisterInfo *MRI, WaitEventType E,
356                      MachineInstr &MI);
357 
358   unsigned hasPendingEvent() const { return PendingEvents; }
359   unsigned hasPendingEvent(WaitEventType E) const {
360     return PendingEvents & (1 << E);
361   }
362   unsigned hasPendingEvent(InstCounterType T) const {
363     unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
364     assert((HasPending != 0) == (getScoreRange(T) != 0));
365     return HasPending;
366   }
367 
368   bool hasMixedPendingEvents(InstCounterType T) const {
369     unsigned Events = hasPendingEvent(T);
370     // Return true if more than one bit is set in Events.
371     return Events & (Events - 1);
372   }
373 
374   bool hasPendingFlat() const {
375     return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
376              LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
377             (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
378              LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
379   }
380 
381   void setPendingFlat() {
382     LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
383     LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
384   }
385 
386   bool hasPendingGDS() const {
387     return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
388   }
389 
390   unsigned getPendingGDSWait() const {
391     return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
392   }
393 
394   void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
395 
396   // Return true if there might be pending writes to the vgpr-interval by VMEM
397   // instructions with types different from V.
398   bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
399     for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
400       assert(RegNo < NUM_ALL_VGPRS);
401       if (VgprVmemTypes[RegNo] & ~(1 << V))
402         return true;
403     }
404     return false;
405   }
406 
407   void clearVgprVmemTypes(RegInterval Interval) {
408     for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
409       assert(RegNo < NUM_ALL_VGPRS);
410       VgprVmemTypes[RegNo] = 0;
411     }
412   }
413 
414   void setStateOnFunctionEntryOrReturn() {
415     setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
416     PendingEvents |= WaitEventMaskForInst[STORE_CNT];
417   }
418 
419   ArrayRef<const MachineInstr *> getLDSDMAStores() const {
420     return LDSDMAStores;
421   }
422 
423   bool hasPointSampleAccel(const MachineInstr &MI) const;
424   bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
425                                       RegInterval Interval) const;
426 
427   void print(raw_ostream &) const;
428   void dump() const { print(dbgs()); }
429 
430 private:
431   struct MergeInfo {
432     unsigned OldLB;
433     unsigned OtherLB;
434     unsigned MyShift;
435     unsigned OtherShift;
436   };
437   static bool mergeScore(const MergeInfo &M, unsigned &Score,
438                          unsigned OtherScore);
439 
440   void setScoreLB(InstCounterType T, unsigned Val) {
441     assert(T < NUM_INST_CNTS);
442     ScoreLBs[T] = Val;
443   }
444 
445   void setScoreUB(InstCounterType T, unsigned Val) {
446     assert(T < NUM_INST_CNTS);
447     ScoreUBs[T] = Val;
448 
449     if (T != EXP_CNT)
450       return;
451 
452     if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
453       ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
454   }
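  // Example of the EXP_CNT clamp above (illustrative): with an expcnt
  // maximum of 7, letting UB - LB grow to 9 would be meaningless, since the
  // hardware counter can never hold more than 7; the lower bound is
  // therefore pulled up to UB - 7.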
455 
456   void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
457     setScoreByInterval({GprNo, GprNo + 1}, T, Val);
458   }
459 
460   void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
461                           unsigned Score);
462 
463   void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
464                          const MachineRegisterInfo *MRI,
465                          const MachineOperand &Op, InstCounterType CntTy,
466                          unsigned Val);
467 
468   const GCNSubtarget *ST = nullptr;
469   InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
470   HardwareLimits Limits = {};
471   const unsigned *WaitEventMaskForInst;
472   InstCounterType SmemAccessCounter;
473   unsigned ScoreLBs[NUM_INST_CNTS] = {0};
474   unsigned ScoreUBs[NUM_INST_CNTS] = {0};
475   unsigned PendingEvents = 0;
476   // Remember the last flat memory operation.
477   unsigned LastFlat[NUM_INST_CNTS] = {0};
478   // Remember the last GDS operation.
479   unsigned LastGDS = 0;
480   // wait_cnt scores for every vgpr.
481   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
482   int VgprUB = -1;
483   int SgprUB = -1;
484   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
485   // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
486   // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
487   // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
488   // X_CNT score.
489   unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
490   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
491   // write to each vgpr.
492   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
493   // Store representative LDS DMA operations. The only useful info here is
494   // alias info. One store is kept per unique AAInfo.
495   SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
496 };
497 
498 // This abstracts the logic for generating and updating S_WAIT* instructions
499 // away from the analysis that determines where they are needed. This was
500 // done because the set of counters and the instructions for waiting on them
501 // underwent a major shift with gfx12, sufficiently so that having this
502 // abstraction keeps the main analysis logic simpler than it would
503 // otherwise have had to become.
504 class WaitcntGenerator {
505 protected:
506   const GCNSubtarget *ST = nullptr;
507   const SIInstrInfo *TII = nullptr;
508   AMDGPU::IsaVersion IV;
509   InstCounterType MaxCounter;
510   bool OptNone;
511 
512 public:
513   WaitcntGenerator() = default;
514   WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
515       : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
516         IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
517         OptNone(MF.getFunction().hasOptNone() ||
518                 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
519 
520   // Return true if the current function should be compiled with no
521   // optimization.
522   bool isOptNone() const { return OptNone; }
523 
524   // Edits an existing sequence of wait count instructions according
525   // to an incoming Waitcnt value, which is itself updated to reflect
526   // any new wait count instructions which may need to be generated by
527   // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
528   // were made.
529   //
530   // This editing will usually merely update operands, but it may also
531   // delete instructions if the incoming Wait value indicates they are not
532   // needed. It may also remove existing instructions for which a wait
533   // is needed if it can be determined that it is better to generate new
534   // instructions later, as can happen on gfx12.
535   virtual bool
536   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
537                           MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
538                           MachineBasicBlock::instr_iterator It) const = 0;
539 
540   // Transform a soft waitcnt into a normal one.
541   bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
542 
543   // Generates new wait count instructions according to the value of
544   // Wait, returning true if any new instructions were created.
545   virtual bool createNewWaitcnt(MachineBasicBlock &Block,
546                                 MachineBasicBlock::instr_iterator It,
547                                 AMDGPU::Waitcnt Wait) = 0;
548 
549   // Returns an array of bit masks which can be used to map values in
550   // WaitEventType to corresponding counter values in InstCounterType.
551   virtual const unsigned *getWaitEventMask() const = 0;
552 
553   // Returns a new waitcnt with all counters except VScnt set to 0. If
554   // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
555   virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
556 
557   virtual ~WaitcntGenerator() = default;
558 
559   // Create a mask value from the initializer list of wait event types.
560   static constexpr unsigned
561   eventMask(std::initializer_list<WaitEventType> Events) {
562     unsigned Mask = 0;
563     for (auto &E : Events)
564       Mask |= 1 << E;
565 
566     return Mask;
567   }
568 };
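// A minimal compile-time check of eventMask (an illustrative addition, not
// part of the original pass): the mask is just the OR of one bit per event.
static_assert(WaitcntGenerator::eventMask({SMEM_ACCESS, SQ_MESSAGE}) ==
                  ((1u << SMEM_ACCESS) | (1u << SQ_MESSAGE)),
              "eventMask ORs together one bit per listed event");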
569 
570 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
571 public:
572   WaitcntGeneratorPreGFX12() = default;
573   WaitcntGeneratorPreGFX12(const MachineFunction &MF)
574       : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
575 
576   bool
577   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
578                           MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
579                           MachineBasicBlock::instr_iterator It) const override;
580 
581   bool createNewWaitcnt(MachineBasicBlock &Block,
582                         MachineBasicBlock::instr_iterator It,
583                         AMDGPU::Waitcnt Wait) override;
584 
585   const unsigned *getWaitEventMask() const override {
586     assert(ST);
587 
588     static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
589         eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
590                    VMEM_BVH_READ_ACCESS}),
591         eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
592         eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
593                    EXP_POS_ACCESS, EXP_LDS_ACCESS}),
594         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
595         0,
596         0,
597         0,
598         0};
599 
600     return WaitEventMaskForInstPreGFX12;
601   }
602 
603   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
604 };
605 
606 class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
607 public:
608   WaitcntGeneratorGFX12Plus() = default;
609   WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
610                             InstCounterType MaxCounter)
611       : WaitcntGenerator(MF, MaxCounter) {}
612 
613   bool
614   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
615                           MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
616                           MachineBasicBlock::instr_iterator It) const override;
617 
618   bool createNewWaitcnt(MachineBasicBlock &Block,
619                         MachineBasicBlock::instr_iterator It,
620                         AMDGPU::Waitcnt Wait) override;
621 
622   const unsigned *getWaitEventMask() const override {
623     assert(ST);
624 
625     static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
626         eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
627         eventMask({LDS_ACCESS, GDS_ACCESS}),
628         eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
629                    EXP_POS_ACCESS, EXP_LDS_ACCESS}),
630         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
631         eventMask({VMEM_SAMPLER_READ_ACCESS}),
632         eventMask({VMEM_BVH_READ_ACCESS}),
633         eventMask({SMEM_ACCESS, SQ_MESSAGE}),
634         eventMask({VMEM_GROUP, SMEM_GROUP})};
635 
636     return WaitEventMaskForInstGFX12Plus;
637   }
638 
639   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
640 };
641 
642 class SIInsertWaitcnts {
643 private:
644   const GCNSubtarget *ST = nullptr;
645   const SIInstrInfo *TII = nullptr;
646   const SIRegisterInfo *TRI = nullptr;
647   const MachineRegisterInfo *MRI = nullptr;
648 
649   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
650   DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
651   MachineLoopInfo *MLI;
652   MachinePostDominatorTree *PDT;
653   AliasAnalysis *AA = nullptr;
654 
655   struct BlockInfo {
656     std::unique_ptr<WaitcntBrackets> Incoming;
657     bool Dirty = true;
658   };
659 
660   InstCounterType SmemAccessCounter;
661 
662   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
663 
664   bool ForceEmitWaitcnt[NUM_INST_CNTS];
665 
666   // In any given run of this pass, WCG will point to one of these two
667   // generator objects, which must have been re-initialised before use
668   // from a value constructed with the subtarget-aware constructor.
669   WaitcntGeneratorPreGFX12 WCGPreGFX12;
670   WaitcntGeneratorGFX12Plus WCGGFX12Plus;
671 
672   WaitcntGenerator *WCG = nullptr;
673 
674   // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
675   // message.
676   DenseSet<MachineInstr *> ReleaseVGPRInsts;
677 
678   InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
679 
680 public:
681   SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
682                    AliasAnalysis *AA)
683       : MLI(MLI), PDT(PDT), AA(AA) {
684     (void)ForceExpCounter;
685     (void)ForceLgkmCounter;
686     (void)ForceVMCounter;
687   }
688 
689   bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
690   bool isPreheaderToFlush(MachineBasicBlock &MBB,
691                           const WaitcntBrackets &ScoreBrackets);
692   bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
693   bool run(MachineFunction &MF);
694 
695   bool isForceEmitWaitcnt() const {
696     for (auto T : inst_counter_types())
697       if (ForceEmitWaitcnt[T])
698         return true;
699     return false;
700   }
701 
702   void setForceEmitWaitcnt() {
703 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
704 // For debug builds, get the debug counter info and adjust if need be
705 #ifndef NDEBUG
706     if (DebugCounter::isCounterSet(ForceExpCounter) &&
707         DebugCounter::shouldExecute(ForceExpCounter)) {
708       ForceEmitWaitcnt[EXP_CNT] = true;
709     } else {
710       ForceEmitWaitcnt[EXP_CNT] = false;
711     }
712 
713     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
714         DebugCounter::shouldExecute(ForceLgkmCounter)) {
715       ForceEmitWaitcnt[DS_CNT] = true;
716       ForceEmitWaitcnt[KM_CNT] = true;
717     } else {
718       ForceEmitWaitcnt[DS_CNT] = false;
719       ForceEmitWaitcnt[KM_CNT] = false;
720     }
721 
722     if (DebugCounter::isCounterSet(ForceVMCounter) &&
723         DebugCounter::shouldExecute(ForceVMCounter)) {
724       ForceEmitWaitcnt[LOAD_CNT] = true;
725       ForceEmitWaitcnt[SAMPLE_CNT] = true;
726       ForceEmitWaitcnt[BVH_CNT] = true;
727     } else {
728       ForceEmitWaitcnt[LOAD_CNT] = false;
729       ForceEmitWaitcnt[SAMPLE_CNT] = false;
730       ForceEmitWaitcnt[BVH_CNT] = false;
731     }
732 #endif // NDEBUG
733   }
734 
735   // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
736   // instruction.
737   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
738     switch (Inst.getOpcode()) {
739     case AMDGPU::GLOBAL_INV:
740       return VMEM_READ_ACCESS; // tracked using loadcnt
741     case AMDGPU::GLOBAL_WB:
742     case AMDGPU::GLOBAL_WBINV:
743       return VMEM_WRITE_ACCESS; // tracked using storecnt
744     default:
745       break;
746     }
747 
748     // Maps VMEM access types to their corresponding WaitEventType.
749     static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
750         VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
751 
752     assert(SIInstrInfo::isVMEM(Inst));
753     // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
754     // these should use VM_CNT.
755     if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
756       return VMEM_ACCESS;
757     if (Inst.mayStore() &&
758         (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
759       // FLAT and SCRATCH instructions may access scratch. Other VMEM
760       // instructions do not.
761       if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
762         return SCRATCH_WRITE_ACCESS;
763       return VMEM_WRITE_ACCESS;
764     }
765     if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
766       return VMEM_READ_ACCESS;
767     return VmemReadMapping[getVmemType(Inst)];
768   }
769 
770   bool hasXcnt() const { return ST->hasWaitXCnt(); }
771 
772   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
773   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
774   bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
775   bool isVmemAccess(const MachineInstr &MI) const;
776   bool generateWaitcntInstBefore(MachineInstr &MI,
777                                  WaitcntBrackets &ScoreBrackets,
778                                  MachineInstr *OldWaitcntInstr,
779                                  bool FlushVmCnt);
780   bool generateWaitcnt(AMDGPU::Waitcnt Wait,
781                        MachineBasicBlock::instr_iterator It,
782                        MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
783                        MachineInstr *OldWaitcntInstr);
784   void updateEventWaitcntAfter(MachineInstr &Inst,
785                                WaitcntBrackets *ScoreBrackets);
786   bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
787                     MachineBasicBlock *Block) const;
788   bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
789                              WaitcntBrackets &ScoreBrackets);
790   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
791                             WaitcntBrackets &ScoreBrackets);
792 };
793 
794 class SIInsertWaitcntsLegacy : public MachineFunctionPass {
795 public:
796   static char ID;
797   SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
798 
799   bool runOnMachineFunction(MachineFunction &MF) override;
800 
801   StringRef getPassName() const override {
802     return "SI insert wait instructions";
803   }
804 
805   void getAnalysisUsage(AnalysisUsage &AU) const override {
806     AU.setPreservesCFG();
807     AU.addRequired<MachineLoopInfoWrapperPass>();
808     AU.addRequired<MachinePostDominatorTreeWrapperPass>();
809     AU.addUsedIfAvailable<AAResultsWrapperPass>();
810     AU.addPreserved<AAResultsWrapperPass>();
811     MachineFunctionPass::getAnalysisUsage(AU);
812   }
813 };
814 
815 } // end anonymous namespace
816 
817 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
818                                             const MachineRegisterInfo *MRI,
819                                             const SIRegisterInfo *TRI,
820                                             const MachineOperand &Op) const {
821   if (!TRI->isInAllocatableClass(Op.getReg()))
822     return {-1, -1};
823 
824   // A use via a partial-write (PW) operand does not need a waitcnt.
825   // A partial write is not a WAW hazard.
826   assert(!Op.getSubReg() || !Op.isUndef());
827 
828   RegInterval Result;
829 
830   MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
831   unsigned RegIdx = TRI->getHWRegIndex(MCReg);
832   assert(isUInt<8>(RegIdx));
833 
834   const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
835   unsigned Size = TRI->getRegSizeInBits(*RC);
836 
837   // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
838   if (TRI->isVectorRegister(*MRI, Op.getReg())) {
839     unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
840     assert(Reg < AGPR_OFFSET);
841     Result.first = Reg;
842     if (TRI->isAGPR(*MRI, Op.getReg()))
843       Result.first += AGPR_OFFSET;
844     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
845     assert(Size % 16 == 0);
846     Result.second = Result.first + (Size / 16);
847   } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
848     // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
849     // sources like SRC_PRIVATE_BASE.
850     Result.first = RegIdx + NUM_ALL_VGPRS;
851     Result.second = Result.first + divideCeil(Size, 32);
852   } else {
853     return {-1, -1};
854   }
855 
856   return Result;
857 }
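// Worked example (illustrative): a store through v[4:5] has RegIdx 4 and
// Size 64, so the interval is {4 << 1, 8 + 64 / 16} == {8, 12}, covering
// both 16-bit halves of v4 and v5. An s[0:1] operand instead yields
// {NUM_ALL_VGPRS, NUM_ALL_VGPRS + 2}.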
858 
859 void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
860                                          InstCounterType CntTy,
861                                          unsigned Score) {
862   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
863     if (RegNo < NUM_ALL_VGPRS) {
864       VgprUB = std::max(VgprUB, RegNo);
865       VgprScores[CntTy][RegNo] = Score;
866     } else {
867       SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
868       SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
869     }
870   }
871 }
872 
873 void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
874                                         const SIRegisterInfo *TRI,
875                                         const MachineRegisterInfo *MRI,
876                                         const MachineOperand &Op,
877                                         InstCounterType CntTy, unsigned Score) {
878   RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
879   setScoreByInterval(Interval, CntTy, Score);
880 }
881 
882 // Return true if the subtarget is one that enables Point Sample Acceleration
883 // and the MachineInstr passed in is one to which it might be applied (the
884 // hardware makes this decision based on several factors, but we can't determine
885 // this at compile time, so we have to assume it might be applied if the
886 // instruction supports it).
887 bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
888   if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
889     return false;
890 
891   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
892   const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
893       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
894   return BaseInfo->PointSampleAccel;
895 }
896 
897 // Return true if the subtarget enables Point Sample Acceleration, the supplied
898 // MachineInstr is one to which it might be applied and the supplied interval is
899 // one that has outstanding writes to vmem-types different from VMEM_NOSAMPLER
900 // (this is the type that a point-sample-accelerated instruction effectively
901 // becomes).
902 bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
903     const MachineInstr &MI, RegInterval Interval) const {
904   if (!hasPointSampleAccel(MI))
905     return false;
906 
907   return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
908 }
909 
910 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
911                                     const SIRegisterInfo *TRI,
912                                     const MachineRegisterInfo *MRI,
913                                     WaitEventType E, MachineInstr &Inst) {
914   InstCounterType T = eventCounter(WaitEventMaskForInst, E);
915 
916   unsigned UB = getScoreUB(T);
917   unsigned CurrScore = UB + 1;
918   if (CurrScore == 0)
919     report_fatal_error("InsertWaitcnt score wraparound");
920   // PendingEvents and ScoreUB need to be updated regardless of whether this
921   // event changes the score of a register or not.
922   // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
923   PendingEvents |= 1 << E;
924   setScoreUB(T, CurrScore);
925 
926   if (T == EXP_CNT) {
927     // Put score on the source vgprs. If this is a store, just use those
928     // specific register(s).
929     if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
930       // All GDS operations must protect their address register (same as
931       // export.)
932       if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
933         setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore);
934 
935       if (Inst.mayStore()) {
936         if (const auto *Data0 =
937                 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
938           setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore);
939         if (const auto *Data1 =
940                 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
941           setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore);
942       } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
943                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
944                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
945                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
946         for (const MachineOperand &Op : Inst.all_uses()) {
947           if (TRI->isVectorRegister(*MRI, Op.getReg()))
948             setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
949         }
950       }
951     } else if (TII->isFLAT(Inst)) {
952       if (Inst.mayStore()) {
953         setScoreByOperand(&Inst, TRI, MRI,
954                           *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
955                           EXP_CNT, CurrScore);
956       } else if (SIInstrInfo::isAtomicRet(Inst)) {
957         setScoreByOperand(&Inst, TRI, MRI,
958                           *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
959                           EXP_CNT, CurrScore);
960       }
961     } else if (TII->isMIMG(Inst)) {
962       if (Inst.mayStore()) {
963         setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
964                           CurrScore);
965       } else if (SIInstrInfo::isAtomicRet(Inst)) {
966         setScoreByOperand(&Inst, TRI, MRI,
967                           *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
968                           EXP_CNT, CurrScore);
969       }
970     } else if (TII->isMTBUF(Inst)) {
971       if (Inst.mayStore())
972         setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
973                           CurrScore);
974     } else if (TII->isMUBUF(Inst)) {
975       if (Inst.mayStore()) {
976         setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
977                           CurrScore);
978       } else if (SIInstrInfo::isAtomicRet(Inst)) {
979         setScoreByOperand(&Inst, TRI, MRI,
980                           *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
981                           EXP_CNT, CurrScore);
982       }
983     } else if (TII->isLDSDIR(Inst)) {
984       // LDSDIR instructions attach the score to the destination.
985       setScoreByOperand(&Inst, TRI, MRI,
986                         *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
987                         EXP_CNT, CurrScore);
988     } else {
989       if (TII->isEXP(Inst)) {
990         // For export the destination registers are really temps that
991         // can be used as the actual source after export patching, so
992         // we need to treat them like sources and set the EXP_CNT
993         // score.
994         for (MachineOperand &DefMO : Inst.all_defs()) {
995           if (TRI->isVGPR(*MRI, DefMO.getReg())) {
996             setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore);
997           }
998         }
999       }
1000       for (const MachineOperand &Op : Inst.all_uses()) {
1001         if (TRI->isVectorRegister(*MRI, Op.getReg()))
1002           setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
1003       }
1004     }
1005   } else if (T == X_CNT) {
1006     for (const MachineOperand &Op : Inst.all_uses())
1007       setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);
1008   } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1009     // Match the score to the destination registers.
1010     //
1011     // Check only explicit operands. Stores, especially spill stores, include
1012     // implicit uses and defs of their super registers which would create an
1013     // artificial dependency, while these are there only for register liveness
1014     // accounting purposes.
1015     //
1016     // Special cases where implicit register defs exist, such as M0 or VCC,
1017     // but none with memory instructions.
1018     for (const MachineOperand &Op : Inst.defs()) {
1019       RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
1020       if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1021         if (Interval.first >= NUM_ALL_VGPRS)
1022           continue;
1023         if (updateVMCntOnly(Inst)) {
1024           // updateVMCntOnly should only leave us with VGPRs
1025           // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1026           // defs. That's required for a sane index into `VgprMemTypes` below
1027           assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1028           VmemType V = getVmemType(Inst);
1029           unsigned char TypesMask = 1 << V;
1030           // If the instruction can have Point Sample Accel applied, we have
1031           // to flag this as another potential dependency.
1032           if (hasPointSampleAccel(Inst))
1033             TypesMask |= 1 << VMEM_NOSAMPLER;
1034           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
1035             VgprVmemTypes[RegNo] |= TypesMask;
1036         }
1037       }
1038       setScoreByInterval(Interval, T, CurrScore);
1039     }
1040     if (Inst.mayStore() &&
1041         (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1042       // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the
1043       // data written to LDS can be accessed. A load from LDS to VMEM does not need a wait.
1044       unsigned Slot = 0;
1045       for (const auto *MemOp : Inst.memoperands()) {
1046         if (!MemOp->isStore() ||
1047             MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1048           continue;
1049         // Comparing just AA info does not guarantee memoperands are equal
1050         // in general, but this is so for LDS DMA in practice.
1051         auto AAI = MemOp->getAAInfo();
1052         // Alias scope information gives a way to definitely identify an
1053         // original memory object and, in practice, is produced by the module
1054         // LDS lowering pass. If there is no scope available we will not be able
1055         // to disambiguate LDS aliasing as after the module lowering all LDS
1056         // is squashed into a single big object. Do not attempt to use one of
1057         // the limited LDSDMAStores for something we will not be able to use
1058         // anyway.
1059         if (!AAI || !AAI.Scope)
1060           break;
1061         for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1062           for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1063             if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1064               Slot = I + 1;
1065               break;
1066             }
1067           }
1068         }
1069         if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1070           break;
1071         LDSDMAStores.push_back(&Inst);
1072         Slot = LDSDMAStores.size();
1073         break;
1074       }
1075       setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
1076       if (Slot)
1077         setRegScore(FIRST_LDS_VGPR, T, CurrScore);
1078     }
1079   }
1080 }
1081 
1082 void WaitcntBrackets::print(raw_ostream &OS) const {
1083   OS << '\n';
1084   for (auto T : inst_counter_types(MaxCounter)) {
1085     unsigned SR = getScoreRange(T);
1086 
1087     switch (T) {
1088     case LOAD_CNT:
1089       OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1090          << SR << "): ";
1091       break;
1092     case DS_CNT:
1093       OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1094          << SR << "): ";
1095       break;
1096     case EXP_CNT:
1097       OS << "    EXP_CNT(" << SR << "): ";
1098       break;
1099     case STORE_CNT:
1100       OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1101          << SR << "): ";
1102       break;
1103     case SAMPLE_CNT:
1104       OS << "    SAMPLE_CNT(" << SR << "): ";
1105       break;
1106     case BVH_CNT:
1107       OS << "    BVH_CNT(" << SR << "): ";
1108       break;
1109     case KM_CNT:
1110       OS << "    KM_CNT(" << SR << "): ";
1111       break;
1112     case X_CNT:
1113       OS << "    X_CNT(" << SR << "): ";
1114       break;
1115     default:
1116       OS << "    UNKNOWN(" << SR << "): ";
1117       break;
1118     }
1119 
1120     if (SR != 0) {
1121       // Print vgpr scores.
1122       unsigned LB = getScoreLB(T);
1123 
1124       for (int J = 0; J <= VgprUB; J++) {
1125         unsigned RegScore = getRegScore(J, T);
1126         if (RegScore <= LB)
1127           continue;
1128         unsigned RelScore = RegScore - LB - 1;
1129         if (J < FIRST_LDS_VGPR) {
1130           OS << RelScore << ":v" << J << " ";
1131         } else {
1132           OS << RelScore << ":ds ";
1133         }
1134       }
1135       // Also need to print sgpr scores for lgkm_cnt or xcnt.
1136       if (isSmemCounter(T)) {
1137         for (int J = 0; J <= SgprUB; J++) {
1138           unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1139           if (RegScore <= LB)
1140             continue;
1141           unsigned RelScore = RegScore - LB - 1;
1142           OS << RelScore << ":s" << J << " ";
1143         }
1144       }
1145     }
1146     OS << '\n';
1147   }
1148 
1149   OS << "Pending Events: ";
1150   if (hasPendingEvent()) {
1151     ListSeparator LS;
1152     for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1153       if (hasPendingEvent((WaitEventType)I)) {
1154         OS << LS << WaitEventTypeName[I];
1155       }
1156     }
1157   } else {
1158     OS << "none";
1159   }
1160   OS << '\n';
1161 
1162   OS << '\n';
1163 }
1164 
1165 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
1166 /// whether a waitcnt instruction is needed at all.
1167 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1168   simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1169   simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1170   simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1171   simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1172   simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1173   simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1174   simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1175   simplifyWaitcnt(X_CNT, Wait.XCnt);
1176 }
1177 
1178 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1179                                       unsigned &Count) const {
1180   // The number of outstanding events for this type, T, can be calculated
1181   // as (UB - LB). If the current Count is greater than or equal to the number
1182   // of outstanding events, then the wait for this counter is redundant.
1183   if (Count >= getScoreRange(T))
1184     Count = ~0u;
1185 }
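// Example (illustrative): with two loads outstanding (UB - LB == 2), a
// requested loadcnt(2) wait is already satisfied by construction -- at most
// two events remain -- so the count is reset to ~0u, meaning "no wait".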
1186 
1187 void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1188                                     AMDGPU::Waitcnt &Wait) const {
1189   const unsigned LB = getScoreLB(T);
1190   const unsigned UB = getScoreUB(T);
1191   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1192     unsigned ScoreToWait = getRegScore(RegNo, T);
1193 
1194     // If the score of src_operand falls within the bracket, we need an
1195     // s_waitcnt instruction.
1196     if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1197       if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1198           !ST->hasFlatLgkmVMemCountInOrder()) {
1199         // If there is a pending FLAT operation, and this is a VMem or LGKM
1200         // waitcnt and the target can report early completion, then we need
1201         // to force a waitcnt 0.
1202         addWait(Wait, T, 0);
1203       } else if (counterOutOfOrder(T)) {
1204         // Counter can get decremented out-of-order when there
1205         // are multiple types event in the bracket. Also emit an s_wait counter
1206         // with a conservative value of 0 for the counter.
1207         addWait(Wait, T, 0);
1208       } else {
1209         // If a counter has been maxed out avoid overflow by waiting for
1210         // MAX(CounterType) - 1 instead.
1211         unsigned NeededWait =
1212             std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1213         addWait(Wait, T, NeededWait);
1214       }
1215     }
1216   }
1217 }
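// Example of the overflow clamp (illustrative): with a vmcnt maximum of 63,
// a score that is 100 events old still yields a wait of vmcnt(62); the clamp
// only costs waiting for a few additional, already-issued operations.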
1218 
1219 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1220   applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1221   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1222   applyWaitcnt(DS_CNT, Wait.DsCnt);
1223   applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1224   applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1225   applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1226   applyWaitcnt(KM_CNT, Wait.KmCnt);
1227   applyXcnt(Wait);
1228 }
1229 
1230 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1231   const unsigned UB = getScoreUB(T);
1232   if (Count >= UB)
1233     return;
1234   if (Count != 0) {
1235     if (counterOutOfOrder(T))
1236       return;
1237     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1238   } else {
1239     setScoreLB(T, UB);
1240     PendingEvents &= ~WaitEventMaskForInst[T];
1241   }
1242 }
1243 
1244 void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1245   // Wait on XCNT is redundant if we are already waiting for a load to complete.
1246   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1247   // zero.
1248   if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1249     return applyWaitcnt(X_CNT, 0);
1250 
1251   // If we have a pending store we cannot optimize XCnt because we do not wait
1252   // for stores. VMEM loads return in order, so if we only have loads XCnt is
1253   // decremented to the same number as LOADCnt.
1254   if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1255       !hasPendingEvent(STORE_CNT))
1256     return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1257 
1258   applyWaitcnt(X_CNT, Wait.XCnt);
1259 }
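// Example of the loads-only case above (illustrative): with only VMEM loads
// outstanding, waiting for loadcnt(1) implies xcnt has also dropped to at
// most 1, because the loads return in order; so X_CNT is applied as
// min(Wait.XCnt, Wait.LoadCnt) rather than as a separate, stricter wait.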
1260 
1261 // Where there are multiple event types in the bracket of a counter, the
1262 // decrement may go out of order.
1263 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1264   // Scalar memory read always can go out of order.
1265   if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1266       (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1267     return true;
1268   return hasMixedPendingEvents(T);
1269 }
1270 
1271 INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1272                       false, false)
1273 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1274 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1275 INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1276                     false, false)
1277 
1278 char SIInsertWaitcntsLegacy::ID = 0;
1279 
1280 char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1281 
1282 FunctionPass *llvm::createSIInsertWaitcntsPass() {
1283   return new SIInsertWaitcntsLegacy();
1284 }
1285 
1286 static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1287                                      unsigned NewEnc) {
1288   int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1289   assert(OpIdx >= 0);
1290 
1291   MachineOperand &MO = MI.getOperand(OpIdx);
1292 
1293   if (NewEnc == MO.getImm())
1294     return false;
1295 
1296   MO.setImm(NewEnc);
1297   return true;
1298 }
1299 
1300 /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1301 /// and if so, which counter it is waiting on.
1302 static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1303   switch (Opcode) {
1304   case AMDGPU::S_WAIT_LOADCNT:
1305     return LOAD_CNT;
1306   case AMDGPU::S_WAIT_EXPCNT:
1307     return EXP_CNT;
1308   case AMDGPU::S_WAIT_STORECNT:
1309     return STORE_CNT;
1310   case AMDGPU::S_WAIT_SAMPLECNT:
1311     return SAMPLE_CNT;
1312   case AMDGPU::S_WAIT_BVHCNT:
1313     return BVH_CNT;
1314   case AMDGPU::S_WAIT_DSCNT:
1315     return DS_CNT;
1316   case AMDGPU::S_WAIT_KMCNT:
1317     return KM_CNT;
1318   case AMDGPU::S_WAIT_XCNT:
1319     return X_CNT;
1320   default:
1321     return {};
1322   }
1323 }
1324 
1325 bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1326   unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1327   if (Opcode == Waitcnt->getOpcode())
1328     return false;
1329 
1330   Waitcnt->setDesc(TII->get(Opcode));
1331   return true;
1332 }
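// Usage sketch (illustrative): soft waits such as S_WAITCNT_soft are placed
// by the memory legalizer as hints. When this pass decides such a wait is
// genuinely required, promoteSoftWaitCnt() swaps in the real opcode; soft
// waits found to be unnecessary are simply erased by the callers.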
1333 
1334 /// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1335 /// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1336 /// from \p Wait that were added by previous passes. Currently this pass
1337 /// conservatively assumes that these preexisting waits are required for
1338 /// correctness.
1339 bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1340     WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1341     AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1342   assert(ST);
1343   assert(isNormalMode(MaxCounter));
1344 
1345   bool Modified = false;
1346   MachineInstr *WaitcntInstr = nullptr;
1347   MachineInstr *WaitcntVsCntInstr = nullptr;
1348 
1349   LLVM_DEBUG({
1350     dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1351     if (It == OldWaitcntInstr.getParent()->instr_end())
1352       dbgs() << "end of block\n";
1353     else
1354       dbgs() << *It;
1355   });
1356 
1357   for (auto &II :
1358        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1359     LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1360     if (II.isMetaInstruction()) {
1361       LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1362       continue;
1363     }
1364 
1365     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1366     bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1367 
1368     // Update required wait count. If this is a soft waitcnt (= it was added
1369     // by an earlier pass), it may be entirely removed.
1370     if (Opcode == AMDGPU::S_WAITCNT) {
1371       unsigned IEnc = II.getOperand(0).getImm();
1372       AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1373       if (TrySimplify)
1374         ScoreBrackets.simplifyWaitcnt(OldWait);
1375       Wait = Wait.combined(OldWait);
1376 
1377       // Merge consecutive waitcnt of the same type by erasing multiples.
1378       if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1379         II.eraseFromParent();
1380         Modified = true;
1381       } else
1382         WaitcntInstr = &II;
1383     } else {
1384       assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1385       assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1386 
1387       unsigned OldVSCnt =
1388           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1389       if (TrySimplify)
1390         ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1391       Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1392 
1393       if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1394         II.eraseFromParent();
1395         Modified = true;
1396       } else
1397         WaitcntVsCntInstr = &II;
1398     }
1399   }
1400 
1401   if (WaitcntInstr) {
1402     Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1403                                          AMDGPU::encodeWaitcnt(IV, Wait));
1404     Modified |= promoteSoftWaitCnt(WaitcntInstr);
1405 
1406     ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1407     ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1408     ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1409     Wait.LoadCnt = ~0u;
1410     Wait.ExpCnt = ~0u;
1411     Wait.DsCnt = ~0u;
1412 
1413     LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1414                    ? dbgs()
1415                          << "applied pre-existing waitcnt\n"
1416                          << "New Instr at block end: " << *WaitcntInstr << '\n'
1417                    : dbgs() << "applied pre-existing waitcnt\n"
1418                             << "Old Instr: " << *It
1419                             << "New Instr: " << *WaitcntInstr << '\n');
1420   }
1421 
1422   if (WaitcntVsCntInstr) {
1423     Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1424                                          AMDGPU::OpName::simm16, Wait.StoreCnt);
1425     Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1426 
1427     ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1428     Wait.StoreCnt = ~0u;
1429 
1430     LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1431                    ? dbgs() << "applied pre-existing waitcnt\n"
1432                             << "New Instr at block end: " << *WaitcntVsCntInstr
1433                             << '\n'
1434                    : dbgs() << "applied pre-existing waitcnt\n"
1435                             << "Old Instr: " << *It
1436                             << "New Instr: " << *WaitcntVsCntInstr << '\n');
1437   }
1438 
1439   return Modified;
1440 }
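// Illustrative before/after of the merging above (pre-gfx12 syntax):
//   s_waitcnt vmcnt(1)      ; pre-existing
//   s_waitcnt lgkmcnt(0)    ; pre-existing
// is rewritten into the single instruction
//   s_waitcnt vmcnt(1) lgkmcnt(0)
// with any additional counts required by \p Wait folded into the encoding.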
1441 
1442 /// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1443 /// required counters in \p Wait
1444 bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1445     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1446     AMDGPU::Waitcnt Wait) {
1447   assert(ST);
1448   assert(isNormalMode(MaxCounter));
1449 
1450   bool Modified = false;
1451   const DebugLoc &DL = Block.findDebugLoc(It);
1452 
1453   // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1454   // single instruction while VScnt has its own instruction.
1455   if (Wait.hasWaitExceptStoreCnt()) {
1456     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1457     [[maybe_unused]] auto SWaitInst =
1458         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1459     Modified = true;
1460 
1461     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1462                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1463                dbgs() << "New Instr: " << *SWaitInst << '\n');
1464   }
1465 
1466   if (Wait.hasWaitStoreCnt()) {
1467     assert(ST->hasVscnt());
1468 
1469     [[maybe_unused]] auto SWaitInst =
1470         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1471             .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1472             .addImm(Wait.StoreCnt);
1473     Modified = true;
1474 
1475     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1476                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1477                dbgs() << "New Instr: " << *SWaitInst << '\n');
1478   }
1479 
1480   return Modified;
1481 }
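// Example of emitted code (illustrative): for Wait = {LoadCnt = 0, DsCnt = 0,
// StoreCnt = 0} on a gfx10 target this produces:
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0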
1482 
1483 AMDGPU::Waitcnt
1484 WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1485   return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1486 }
1487 
1488 AMDGPU::Waitcnt
1489 WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1490   return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1491                          ~0u /* XCNT */);
1492 }
1493 
1494 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1495 /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1496 /// were added by previous passes. Currently this pass conservatively
1497 /// assumes that these preexisting waits are required for correctness.
1498 bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1499     WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1500     AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1501   assert(ST);
1502   assert(!isNormalMode(MaxCounter));
1503 
1504   bool Modified = false;
1505   MachineInstr *CombinedLoadDsCntInstr = nullptr;
1506   MachineInstr *CombinedStoreDsCntInstr = nullptr;
1507   MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1508 
1509   LLVM_DEBUG({
1510     dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1511     if (It == OldWaitcntInstr.getParent()->instr_end())
1512       dbgs() << "end of block\n";
1513     else
1514       dbgs() << *It;
1515   });
1516 
1517   for (auto &II :
1518        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1519     LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1520     if (II.isMetaInstruction()) {
1521       LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1522       continue;
1523     }
1524 
1525     MachineInstr **UpdatableInstr;
1526 
1527     // Update required wait count. If this is a soft waitcnt (= it was added
1528     // by an earlier pass), it may be entirely removed.
1529 
1530     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1531     bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1532 
1533     // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1534     // attempt to do more than that either.
1535     if (Opcode == AMDGPU::S_WAITCNT)
1536       continue;
1537 
1538     if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1539       unsigned OldEnc =
1540           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1541       AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1542       if (TrySimplify)
1543         ScoreBrackets.simplifyWaitcnt(OldWait);
1544       Wait = Wait.combined(OldWait);
1545       UpdatableInstr = &CombinedLoadDsCntInstr;
1546     } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1547       unsigned OldEnc =
1548           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1549       AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1550       if (TrySimplify)
1551         ScoreBrackets.simplifyWaitcnt(OldWait);
1552       Wait = Wait.combined(OldWait);
1553       UpdatableInstr = &CombinedStoreDsCntInstr;
1554     } else {
1555       std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1556       assert(CT.has_value());
1557       unsigned OldCnt =
1558           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1559       if (TrySimplify)
1560         ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1561       addWait(Wait, CT.value(), OldCnt);
1562       UpdatableInstr = &WaitInstrs[CT.value()];
1563     }
1564 
1565     // Merge consecutive waitcnt of the same type by erasing multiples.
1566     if (!*UpdatableInstr) {
1567       *UpdatableInstr = &II;
1568     } else {
1569       II.eraseFromParent();
1570       Modified = true;
1571     }
1572   }
1573 
1574   if (CombinedLoadDsCntInstr) {
1575     // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1576     // to be waited for. Otherwise, let the instruction be deleted so
1577     // the appropriate single counter wait instruction can be inserted
1578     // instead, when new S_WAIT_*CNT instructions are inserted by
1579     // createNewWaitcnt(). As a side effect, resetting the wait counts will
1580     // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1581     // the loop below that deals with single counter instructions.
1582     if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1583       unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1584       Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1585                                            AMDGPU::OpName::simm16, NewEnc);
1586       Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1587       ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1588       ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1589       Wait.LoadCnt = ~0u;
1590       Wait.DsCnt = ~0u;
1591 
1592       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1593                      ? dbgs() << "applied pre-existing waitcnt\n"
1594                               << "New Instr at block end: "
1595                               << *CombinedLoadDsCntInstr << '\n'
1596                      : dbgs() << "applied pre-existing waitcnt\n"
1597                               << "Old Instr: " << *It << "New Instr: "
1598                               << *CombinedLoadDsCntInstr << '\n');
1599     } else {
1600       CombinedLoadDsCntInstr->eraseFromParent();
1601       Modified = true;
1602     }
1603   }
1604 
1605   if (CombinedStoreDsCntInstr) {
1606     // Similarly for S_WAIT_STORECNT_DSCNT.
1607     if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1608       unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1609       Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1610                                            AMDGPU::OpName::simm16, NewEnc);
1611       Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1612       ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1613       ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1614       Wait.StoreCnt = ~0u;
1615       Wait.DsCnt = ~0u;
1616 
1617       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1618                      ? dbgs() << "applied pre-existing waitcnt\n"
1619                               << "New Instr at block end: "
1620                               << *CombinedStoreDsCntInstr << '\n'
1621                      : dbgs() << "applied pre-existing waitcnt\n"
1622                               << "Old Instr: " << *It << "New Instr: "
1623                               << *CombinedStoreDsCntInstr << '\n');
1624     } else {
1625       CombinedStoreDsCntInstr->eraseFromParent();
1626       Modified = true;
1627     }
1628   }
1629 
1630   // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1631   // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1632   // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1633   // instructions so that createNewWaitcnt() will create new combined
1634   // instructions to replace them.
1635 
1636   if (Wait.DsCnt != ~0u) {
1637     // This is a vector of addresses in WaitInstrs pointing to instructions
1638     // that should be removed if they are present.
1639     SmallVector<MachineInstr **, 2> WaitsToErase;
1640 
1641     // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1642     // both) need to be waited for, ensure that there are no existing
1643     // individual wait count instructions for these.
1644 
1645     if (Wait.LoadCnt != ~0u) {
1646       WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1647       WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1648     } else if (Wait.StoreCnt != ~0u) {
1649       WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1650       WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1651     }
1652 
1653     for (MachineInstr **WI : WaitsToErase) {
1654       if (!*WI)
1655         continue;
1656 
1657       (*WI)->eraseFromParent();
1658       *WI = nullptr;
1659       Modified = true;
1660     }
1661   }
1662 
1663   for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1664     if (!WaitInstrs[CT])
1665       continue;
1666 
1667     unsigned NewCnt = getWait(Wait, CT);
1668     if (NewCnt != ~0u) {
1669       Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1670                                            AMDGPU::OpName::simm16, NewCnt);
1671       Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1672 
1673       ScoreBrackets.applyWaitcnt(CT, NewCnt);
1674       setNoWait(Wait, CT);
1675 
1676       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1677                      ? dbgs() << "applied pre-existing waitcnt\n"
1678                               << "New Instr at block end: " << *WaitInstrs[CT]
1679                               << '\n'
1680                      : dbgs() << "applied pre-existing waitcnt\n"
1681                               << "Old Instr: " << *It
1682                               << "New Instr: " << *WaitInstrs[CT] << '\n');
1683     } else {
1684       WaitInstrs[CT]->eraseFromParent();
1685       Modified = true;
1686     }
1687   }
1688 
1689   return Modified;
1690 }
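// Illustrative net effect of the combining logic above (gfx12+ syntax):
//   s_wait_loadcnt 0x0
//   s_wait_dscnt 0x0
// are erased here so that createNewWaitcnt() can emit the single
//   s_wait_loadcnt_dscnt 0x0
// while a combined wait whose DScnt half is not needed decays back to a
// single-counter instruction instead.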
1691 
1692 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1693 bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1694     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1695     AMDGPU::Waitcnt Wait) {
1696   assert(ST);
1697   assert(!isNormalMode(MaxCounter));
1698 
1699   bool Modified = false;
1700   const DebugLoc &DL = Block.findDebugLoc(It);
1701 
1702   // Check for opportunities to use combined wait instructions.
1703   if (Wait.DsCnt != ~0u) {
1704     MachineInstr *SWaitInst = nullptr;
1705 
1706     if (Wait.LoadCnt != ~0u) {
1707       unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1708 
1709       SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1710                       .addImm(Enc);
1711 
1712       Wait.LoadCnt = ~0u;
1713       Wait.DsCnt = ~0u;
1714     } else if (Wait.StoreCnt != ~0u) {
1715       unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1716 
1717       SWaitInst =
1718           BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1719               .addImm(Enc);
1720 
1721       Wait.StoreCnt = ~0u;
1722       Wait.DsCnt = ~0u;
1723     }
1724 
1725     if (SWaitInst) {
1726       Modified = true;
1727 
1728       LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1729                  if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1730                  dbgs() << "New Instr: " << *SWaitInst << '\n');
1731     }
1732   }
1733 
1734   // Generate an instruction for any remaining counter that needs
1735   // waiting for.
1736 
1737   for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1738     unsigned Count = getWait(Wait, CT);
1739     if (Count == ~0u)
1740       continue;
1741 
1742     [[maybe_unused]] auto SWaitInst =
1743         BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1744             .addImm(Count);
1745 
1746     Modified = true;
1747 
1748     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1749                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1750                dbgs() << "New Instr: " << *SWaitInst << '\n');
1751   }
1752 
1753   return Modified;
1754 }
1755 
1756 static bool readsVCCZ(const MachineInstr &MI) {
1757   unsigned Opc = MI.getOpcode();
1758   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1759          !MI.getOperand(1).isUndef();
1760 }
1761 
1762 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1763 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1764   // Currently all conventions wait, but this may not always be the case.
1765   //
1766   // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1767   // sense to omit the wait and do it in the caller.
1768   return true;
1769 }
1770 
1771 /// \returns true if the callee is expected to resolve any outstanding waits
1772 /// before returning.
1773 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1774 
1775 ///  Generate an s_waitcnt instruction to be placed before cur_Inst.
1776 ///  Instructions of a given type are returned in order,
1777 ///  but instructions of different types can complete out of order.
1778 ///  We rely on this in-order completion
1779 ///  and simply assign a score to the memory access instructions.
1780 ///  We keep track of the active "score bracket" to determine
1781 ///  if a memory access requires an s_waitcnt
1782 ///  and, if so, what the value of each counter should be.
1783 ///  The "score bracket" is bound by the lower bound and upper bound
1784 ///  scores (*_score_LB and *_score_ub respectively).
1785 ///  If FlushVmCnt is true, we want to generate an s_waitcnt to
1786 ///  flush the vmcnt counter here.
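///  A minimal motivating example (illustrative only):
///    buffer_load_dword v0, ...   ; increments vmcnt
///    s_waitcnt vmcnt(0)          ; inserted by this function
///    v_add_f32 v1, v0, v2        ; reads v0: a RAW hazard on the load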
1787 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1788                                                  WaitcntBrackets &ScoreBrackets,
1789                                                  MachineInstr *OldWaitcntInstr,
1790                                                  bool FlushVmCnt) {
1791   setForceEmitWaitcnt();
1792 
1793   assert(!MI.isMetaInstruction());
1794 
1795   AMDGPU::Waitcnt Wait;
1796 
1797   // FIXME: This should have already been handled by the memory legalizer.
1798   // Removing this currently doesn't affect any lit tests, but we need to
1799   // verify that nothing was relying on this. The number of buffer invalidates
1800   // being handled here should not be expanded.
1801   if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1802       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1803       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1804       MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1805       MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1806     Wait.LoadCnt = 0;
1807   }
1808 
1809   // All waits must be resolved at call return.
1810   // NOTE: this could be improved with knowledge of all call sites or
1811   //   with knowledge of the called routines.
1812   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1813       MI.getOpcode() == AMDGPU::SI_RETURN ||
1814       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1815       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1816     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1817   }
1818   // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1819   // Technically the hardware will do this on its own if we don't, but that
1820   // might cost extra cycles compared to doing it explicitly.
1821   // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1822   // have to wait for outstanding VMEM stores. In this case it can be useful to
1823   // send a message to explicitly release all VGPRs before the stores have
1824   // completed, but it is only safe to do this if there are no outstanding
1825   // scratch stores.
1826   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1827            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1828     if (!WCG->isOptNone() &&
1829         (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1830          (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1831           ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1832           !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1833       ReleaseVGPRInsts.insert(&MI);
1834   }
1835   // Resolve vm waits before gs-done.
1836   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1837             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1838            ST->hasLegacyGeometry() &&
1839            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1840             AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1841     Wait.LoadCnt = 0;
1842   }
1843 
1844   // Export & GDS instructions do not read the EXEC mask until after the export
1845   // is granted (which can occur well after the instruction is issued).
1846   // The shader program must flush all EXP operations on the export-count
1847   // before overwriting the EXEC mask.
1848   else {
1849     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1850       // Export and GDS are tracked individually; either may trigger a waitcnt
1851       // for EXEC.
1852       if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1853           ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1854           ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1855           ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1856         Wait.ExpCnt = 0;
1857       }
1858     }
1859 
1860     // Wait for any pending GDS instruction to complete before any
1861     // "Always GDS" instruction.
1862     if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
1863       addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1864 
1865     if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1866       // The function is going to insert a wait on everything in its prolog.
1867       // We still need to be careful if the call target is a load (e.g. a GOT
1868       // load). We also need to check the WAW dependency with the saved PC.
1869       Wait = AMDGPU::Waitcnt();
1870 
1871       const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1872       if (CallAddrOp.isReg()) {
1873         RegInterval CallAddrOpInterval =
1874             ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);
1875 
1876         ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1877                                     Wait);
1878 
1879         if (const auto *RtnAddrOp =
1880                 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
1881           RegInterval RtnAddrOpInterval =
1882               ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);
1883 
1884           ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1885                                       Wait);
1886         }
1887       }
1888     } else {
1889       // FIXME: Should not be relying on memoperands.
1890       // Look at the source operands of every instruction to see if
1891       // any of them results from a previous memory operation that affects
1892       // its current usage. If so, an s_waitcnt instruction needs to be
1893       // emitted.
1894       // If the source operand was defined by a load, add the s_waitcnt
1895       // instruction.
1896       //
1897       // Two cases are handled for destination operands:
1898       // 1) If the destination operand was defined by a load, add the s_waitcnt
1899       // instruction to guarantee the right WAW order.
1900       // 2) If a destination operand that was used by a recent export/store ins,
1901       // add s_waitcnt on exp_cnt to guarantee the WAR order.
1902 
1903       for (const MachineMemOperand *Memop : MI.memoperands()) {
1904         const Value *Ptr = Memop->getValue();
1905         if (Memop->isStore()) {
1906           if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
1907             addWait(Wait, SmemAccessCounter, 0);
1908             if (PDT->dominates(MI.getParent(), It->second))
1909               SLoadAddresses.erase(It);
1910           }
1911         }
1912         unsigned AS = Memop->getAddrSpace();
1913         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1914           continue;
1915         // No need to wait before load from VMEM to LDS.
1916         if (TII->mayWriteLDSThroughDMA(MI))
1917           continue;
1918 
1919         // LOAD_CNT is only relevant to vgpr or LDS.
1920         unsigned RegNo = FIRST_LDS_VGPR;
1921         // Only objects with alias scope info were added to LDSDMAScopes array.
1922         // In the absence of the scope info we will not be able to disambiguate
1923         // aliasing here. There is no need to try searching for a corresponding
1924         // store slot. This is conservatively correct because in that case we
1925         // will produce a wait using the first (general) LDS DMA wait slot which
1926         // will wait on all of them anyway.
1927         if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1928           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1929           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1930             if (MI.mayAlias(AA, *LDSDMAStores[I], true))
1931               ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1932           }
1933         } else {
1934           ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1935         }
1936         if (Memop->isStore()) {
1937           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1938         }
1939       }
1940 
1941       // Loop over use and def operands.
1942       for (const MachineOperand &Op : MI.operands()) {
1943         if (!Op.isReg())
1944           continue;
1945 
1946         // If the instruction does not read tied source, skip the operand.
1947         if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1948           continue;
1949 
1950         RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);
1951 
1952         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1953         if (IsVGPR) {
1954           // Implicit VGPR defs and uses are never part of the memory
1955           // instruction's description and are usually present to account for
1956           // super-register liveness.
1957           // TODO: Most of the other instructions also have implicit uses
1958           // for the liveness accounting only.
1959           if (Op.isImplicit() && MI.mayLoadOrStore())
1960             continue;
1961 
1962           // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1963           // previous write and this write are the same type of VMEM
1964           // instruction, in which case they are (in some architectures)
1965           // guaranteed to write their results in order anyway.
1966           // Additionally check instructions where Point Sample Acceleration
1967           // might be applied.
1968           if (Op.isUse() || !updateVMCntOnly(MI) ||
1969               ScoreBrackets.hasOtherPendingVmemTypes(Interval,
1970                                                      getVmemType(MI)) ||
1971               ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
1972               !ST->hasVmemWriteVgprInOrder()) {
1973             ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
1974             ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
1975             ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
1976             ScoreBrackets.clearVgprVmemTypes(Interval);
1977           }
1978 
1979           if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1980             ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
1981           }
1982           ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
1983         } else {
1984           ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
1985         }
1986 
1987         if (hasXcnt() && Op.isDef())
1988           ScoreBrackets.determineWait(X_CNT, Interval, Wait);
1989       }
1990     }
1991   }
1992 
1993   // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1994   // not, we need to ensure the subtarget is capable of backing off barrier
1995   // instructions in case there are any outstanding memory operations that may
1996   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1997   if (TII->isBarrierStart(MI.getOpcode()) &&
1998       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1999     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2000   }
2001 
2002   // TODO: Remove this work-around, enable the assert for Bug 457939
2003   //       after fixing the scheduler. Also, the Shader Compiler code is
2004   //       independent of target.
2005   if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
2006     if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2007       Wait.DsCnt = 0;
2008     }
2009   }
2010 
2011   // Verify that the wait is actually needed.
2012   ScoreBrackets.simplifyWaitcnt(Wait);
2013 
2014   // When forcing emission, we need to skip terminators because inserting a
2015   // waitcnt between the terminators would break the MBB's terminator sequence.
2016   if (ForceEmitZeroFlag && !MI.isTerminator())
2017     Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2018 
2019   if (ForceEmitWaitcnt[LOAD_CNT])
2020     Wait.LoadCnt = 0;
2021   if (ForceEmitWaitcnt[EXP_CNT])
2022     Wait.ExpCnt = 0;
2023   if (ForceEmitWaitcnt[DS_CNT])
2024     Wait.DsCnt = 0;
2025   if (ForceEmitWaitcnt[SAMPLE_CNT])
2026     Wait.SampleCnt = 0;
2027   if (ForceEmitWaitcnt[BVH_CNT])
2028     Wait.BvhCnt = 0;
2029   if (ForceEmitWaitcnt[KM_CNT])
2030     Wait.KmCnt = 0;
2031   if (ForceEmitWaitcnt[X_CNT])
2032     Wait.XCnt = 0;
2033 
2034   if (FlushVmCnt) {
2035     if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2036       Wait.LoadCnt = 0;
2037     if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2038       Wait.SampleCnt = 0;
2039     if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2040       Wait.BvhCnt = 0;
2041   }
2042 
2043   if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2044     Wait.LoadCnt = 0;
2045 
2046   return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2047                          OldWaitcntInstr);
2048 }
2049 
2050 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2051                                        MachineBasicBlock::instr_iterator It,
2052                                        MachineBasicBlock &Block,
2053                                        WaitcntBrackets &ScoreBrackets,
2054                                        MachineInstr *OldWaitcntInstr) {
2055   bool Modified = false;
2056 
2057   if (OldWaitcntInstr)
2058     // Try to merge the required wait with preexisting waitcnt instructions.
2059     // Also erase redundant waitcnt.
2060     Modified =
2061         WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2062 
2063   // Any counts that could have been applied to existing waitcnt instructions
2064   // have been applied by now; deal with any remaining counts here.
2065   ScoreBrackets.applyWaitcnt(Wait);
2066 
2067   // ExpCnt can be merged into VINTERP.
2068   if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2069       SIInstrInfo::isVINTERP(*It)) {
2070     MachineOperand *WaitExp =
2071         TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2072     if (Wait.ExpCnt < WaitExp->getImm()) {
2073       WaitExp->setImm(Wait.ExpCnt);
2074       Modified = true;
2075     }
2076     Wait.ExpCnt = ~0u;
2077 
2078     LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2079                       << "Update Instr: " << *It);
2080   }
2081 
2082   // XCnt may be already consumed by a load wait.
2083   if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
2084       !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
2085     Wait.XCnt = ~0u;
2086 
2087   if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
2088       !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2089     Wait.XCnt = ~0u;
2090 
2091   // Since the translation of VMEM addresses occurs in order, we can skip the
2092   // XCnt if the current instruction is of VMEM type and has a memory dependency
2093   // with another VMEM instruction in flight.
2094   if (Wait.XCnt != ~0u && isVmemAccess(*It))
2095     Wait.XCnt = ~0u;
2096 
2097   if (WCG->createNewWaitcnt(Block, It, Wait))
2098     Modified = true;
2099 
2100   return Modified;
2101 }
2102 
2103 // This is a flat memory operation. Check to see if it has memory tokens other
2104 // than LDS. Other address spaces supported by flat memory operations involve
2105 // global memory.
2106 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
2107   assert(TII->isFLAT(MI));
2108 
2109   // All flat instructions use the VMEM counter.
2110   assert(TII->usesVM_CNT(MI));
2111 
2112   // If there are no memory operands then conservatively assume the flat
2113   // operation may access VMEM.
2114   if (MI.memoperands_empty())
2115     return true;
2116 
2117   // See if any memory operand specifies an address space that involves VMEM.
2118   // Flat operations support only FLAT, LOCAL (LDS), or address spaces
2119   // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
2120   // (GDS) address space is not supported by flat operations. Therefore, simply
2121   // return true unless only the LDS address space is found.
2122   for (const MachineMemOperand *Memop : MI.memoperands()) {
2123     unsigned AS = Memop->getAddrSpace();
2124     assert(AS != AMDGPUAS::REGION_ADDRESS);
2125     if (AS != AMDGPUAS::LOCAL_ADDRESS)
2126       return true;
2127   }
2128 
2129   return false;
2130 }
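// Example (illustrative): a flat load whose only memory operand is in
// AMDGPUAS::LOCAL_ADDRESS is known to touch LDS only and returns false here,
// while a flat access carrying no memory operands at all must conservatively
// be assumed to reach VMEM.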
2131 
2132 // This is a flat memory operation. Check to see if it has memory tokens for
2133 // either LDS or FLAT.
2134 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
2135   assert(TII->isFLAT(MI));
2136 
2137   // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
2138   if (!TII->usesLGKM_CNT(MI))
2139     return false;
2140 
2141   // If in tgsplit mode then there can be no use of LDS.
2142   if (ST->isTgSplitEnabled())
2143     return false;
2144 
2145   // If there are no memory operands then conservatively assume the flat
2146   // operation may access LDS.
2147   if (MI.memoperands_empty())
2148     return true;
2149 
2150   // See if any memory operand specifies an address space that involves LDS.
2151   for (const MachineMemOperand *Memop : MI.memoperands()) {
2152     unsigned AS = Memop->getAddrSpace();
2153     if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2154       return true;
2155   }
2156 
2157   return false;
2158 }
2159 
2160 // This is a flat memory operation. Check to see if it has memory tokens for
2161 // either scratch or FLAT.
2162 bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2163     const MachineInstr &MI) const {
2164   assert(TII->isFLAT(MI));
2165 
2166   // SCRATCH instructions always access scratch.
2167   if (TII->isFLATScratch(MI))
2168     return true;
2169 
2170   // GLOBAL instructions never access scratch.
2171   if (TII->isFLATGlobal(MI))
2172     return false;
2173 
2174   // If there are no memory operands then conservatively assume the flat
2175   // operation may access scratch.
2176   if (MI.memoperands_empty())
2177     return true;
2178 
2179   // See if any memory operand specifies an address space that involves scratch.
2180   return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
2181     unsigned AS = Memop->getAddrSpace();
2182     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2183   });
2184 }
2185 
2186 bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2187   return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
2188          (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2189 }
2190 
2191 static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
2192   auto Opc = Inst.getOpcode();
2193   return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2194          Opc == AMDGPU::GLOBAL_WBINV;
2195 }
2196 
2197 // Return true if the next instruction is S_ENDPGM, following fallthrough
2198 // blocks if necessary.
2199 bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2200                                     MachineBasicBlock *Block) const {
2201   auto BlockEnd = Block->getParent()->end();
2202   auto BlockIter = Block->getIterator();
2203 
2204   while (true) {
2205     if (It.isEnd()) {
2206       if (++BlockIter != BlockEnd) {
2207         It = BlockIter->instr_begin();
2208         continue;
2209       }
2210 
2211       return false;
2212     }
2213 
2214     if (!It->isMetaInstruction())
2215       break;
2216 
2217     It++;
2218   }
2219 
2220   assert(!It.isEnd());
2221 
2222   return It->getOpcode() == AMDGPU::S_ENDPGM;
2223 }
2224 
2225 // Add a wait after an instruction if architecture requirements mandate one.
2226 bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2227                                              MachineBasicBlock &Block,
2228                                              WaitcntBrackets &ScoreBrackets) {
2229   AMDGPU::Waitcnt Wait;
2230   bool NeedsEndPGMCheck = false;
2231 
2232   if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2233     Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2234                                   !SIInstrInfo::isAtomicRet(Inst));
2235 
2236   if (TII->isAlwaysGDS(Inst.getOpcode())) {
2237     Wait.DsCnt = 0;
2238     NeedsEndPGMCheck = true;
2239   }
2240 
2241   ScoreBrackets.simplifyWaitcnt(Wait);
2242 
2243   auto SuccessorIt = std::next(Inst.getIterator());
2244   bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2245                                 /*OldWaitcntInstr=*/nullptr);
2246 
2247   if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2248     BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2249         .addImm(0);
2250   }
2251 
2252   return Result;
2253 }
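// Illustrative result of the always-GDS case above when the wave is about to
// exit (exact opcodes depend on the target):
//   ds_ordered_count ...          ; always-GDS operation
//   s_waitcnt lgkmcnt(0)          ; forced wait inserted after it
//   s_nop 0                       ; padding before the final s_endpgm
//   s_endpgm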
2254 
2255 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2256                                                WaitcntBrackets *ScoreBrackets) {
2257   // Now look at the instruction opcode. If it is a memory access
2258   // instruction, update the upper-bound of the appropriate counter's
2259   // bracket and the destination operand scores.
2260   // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2261 
2262   bool IsVMEMAccess = false;
2263   bool IsSMEMAccess = false;
2264   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2265     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2266         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2267       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
2268       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2269       ScoreBrackets->setPendingGDS();
2270     } else {
2271       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2272     }
2273   } else if (TII->isFLAT(Inst)) {
2274     if (isGFX12CacheInvOrWBInst(Inst)) {
2275       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2276                                    Inst);
2277       return;
2278     }
2279 
2280     assert(Inst.mayLoadOrStore());
2281 
2282     int FlatASCount = 0;
2283 
2284     if (mayAccessVMEMThroughFlat(Inst)) {
2285       ++FlatASCount;
2286       IsVMEMAccess = true;
2287       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2288                                    Inst);
2289     }
2290 
2291     if (mayAccessLDSThroughFlat(Inst)) {
2292       ++FlatASCount;
2293       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2294     }
2295 
2296     // A Flat memory operation must access at least one address space.
2297     assert(FlatASCount);
2298 
2299     // This is a flat memory operation that accesses both VMEM and LDS, so note
2300     // it - it will require that both the VM and LGKM counters be flushed to zero
2301     // if it is pending when a VM or LGKM dependency occurs.
2302     if (FlatASCount > 1)
2303       ScoreBrackets->setPendingFlat();
2304   } else if (SIInstrInfo::isVMEM(Inst) &&
2305              !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2306     IsVMEMAccess = true;
2307     ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2308                                  Inst);
2309 
2310     if (ST->vmemWriteNeedsExpWaitcnt() &&
2311         (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2312       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2313     }
2314   } else if (TII->isSMRD(Inst)) {
2315     IsSMEMAccess = true;
2316     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2317   } else if (Inst.isCall()) {
2318     if (callWaitsOnFunctionReturn(Inst)) {
2319       // Act as a wait on everything
2320       ScoreBrackets->applyWaitcnt(
2321           WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2322       ScoreBrackets->setStateOnFunctionEntryOrReturn();
2323     } else {
2324       // May need to wait for anything.
2325       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2326     }
2327   } else if (SIInstrInfo::isLDSDIR(Inst)) {
2328     ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2329   } else if (TII->isVINTERP(Inst)) {
2330     int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2331     ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2332   } else if (SIInstrInfo::isEXP(Inst)) {
2333     unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2334     if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2335       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2336     else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2337       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2338     else
2339       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2340   } else {
2341     switch (Inst.getOpcode()) {
2342     case AMDGPU::S_SENDMSG:
2343     case AMDGPU::S_SENDMSG_RTN_B32:
2344     case AMDGPU::S_SENDMSG_RTN_B64:
2345     case AMDGPU::S_SENDMSGHALT:
2346       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2347       break;
2348     case AMDGPU::S_MEMTIME:
2349     case AMDGPU::S_MEMREALTIME:
2350     case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2351     case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2352     case AMDGPU::S_GET_BARRIER_STATE_M0:
2353     case AMDGPU::S_GET_BARRIER_STATE_IMM:
2354       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2355       break;
2356     }
2357   }
2358 
2359   if (!hasXcnt())
2360     return;
2361 
2362   if (IsVMEMAccess)
2363     ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
2364 
2365   if (IsSMEMAccess)
2366     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
2367 }
2368 
2369 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2370                                  unsigned OtherScore) {
2371   unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2372   unsigned OtherShifted =
2373       OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2374   Score = std::max(MyShifted, OtherShifted);
2375   return OtherShifted > MyShifted;
2376 }
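// Worked example (illustrative): if this side has [LB = 2, UB = 5]
// (3 pending) and the other has [LB = 0, UB = 7] (7 pending), merge() picks
// NewUB = 2 + max(3, 7) = 9, giving MyShift = 4 and OtherShift = 2. Each
// side's scores are rebased by its shift onto the common scale, and any
// score at or below its old lower bound maps to 0 (already satisfied).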
2377 
2378 /// Merge the pending events and associated score brackets of \p Other into
2379 /// this bracket's status.
2380 ///
2381 /// Returns whether the merge resulted in a change that requires tighter waits
2382 /// (i.e. the merged brackets strictly dominate the original brackets).
2383 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2384   bool StrictDom = false;
2385 
2386   VgprUB = std::max(VgprUB, Other.VgprUB);
2387   SgprUB = std::max(SgprUB, Other.SgprUB);
2388 
2389   for (auto T : inst_counter_types(MaxCounter)) {
2390     // Merge event flags for this counter
2391     const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2392     const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2393     if (OtherEvents & ~OldEvents)
2394       StrictDom = true;
2395     PendingEvents |= OtherEvents;
2396 
2397     // Merge scores for this counter
2398     const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2399     const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2400     const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2401     if (NewUB < ScoreLBs[T])
2402       report_fatal_error("waitcnt score overflow");
2403 
2404     MergeInfo M;
2405     M.OldLB = ScoreLBs[T];
2406     M.OtherLB = Other.ScoreLBs[T];
2407     M.MyShift = NewUB - ScoreUBs[T];
2408     M.OtherShift = NewUB - Other.ScoreUBs[T];
2409 
2410     ScoreUBs[T] = NewUB;
2411 
2412     StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2413 
2414     if (T == DS_CNT)
2415       StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2416 
2417     for (int J = 0; J <= VgprUB; J++)
2418       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2419 
2420     if (isSmemCounter(T)) {
2421       unsigned Idx = getSgprScoresIdx(T);
2422       for (int J = 0; J <= SgprUB; J++)
2423         StrictDom |=
2424             mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
2425     }
2426   }
2427 
2428   for (int J = 0; J <= VgprUB; J++) {
2429     unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2430     StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2431     VgprVmemTypes[J] = NewVmemTypes;
2432   }
2433 
2434   return StrictDom;
2435 }
2436 
2437 static bool isWaitInstr(MachineInstr &Inst) {
2438   unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2439   return Opcode == AMDGPU::S_WAITCNT ||
2440          (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2441           Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2442          Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2443          Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2444          counterTypeForInstr(Opcode).has_value();
2445 }
2446 
2447 // Generate s_waitcnt instructions where needed.
2448 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2449                                             MachineBasicBlock &Block,
2450                                             WaitcntBrackets &ScoreBrackets) {
2451   bool Modified = false;
2452 
2453   LLVM_DEBUG({
2454     dbgs() << "*** Begin Block: ";
2455     Block.printName(dbgs());
2456     ScoreBrackets.dump();
2457   });
2458 
2459   // Track the correctness of vccz through this basic block. There are two
2460   // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2461   // ST->partialVCCWritesUpdateVCCZ().
2462   bool VCCZCorrect = true;
2463   if (ST->hasReadVCCZBug()) {
2464     // vccz could be incorrect at a basic block boundary if a predecessor wrote
2465     // to vcc and then issued an smem load.
2466     VCCZCorrect = false;
2467   } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2468     // vccz could be incorrect at a basic block boundary if a predecessor wrote
2469     // to vcc_lo or vcc_hi.
2470     VCCZCorrect = false;
2471   }
2472 
2473   // Walk over the instructions.
2474   MachineInstr *OldWaitcntInstr = nullptr;
2475 
2476   for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2477                                          E = Block.instr_end();
2478        Iter != E;) {
2479     MachineInstr &Inst = *Iter;
2480     if (Inst.isMetaInstruction()) {
2481       ++Iter;
2482       continue;
2483     }
2484 
2485     // Track pre-existing waitcnts that were added in earlier iterations or by
2486     // the memory legalizer.
2487     if (isWaitInstr(Inst)) {
2488       if (!OldWaitcntInstr)
2489         OldWaitcntInstr = &Inst;
2490       ++Iter;
2491       continue;
2492     }
2493 
2494     bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2495                       isPreheaderToFlush(Block, ScoreBrackets);
2496 
2497     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2498     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2499                                           FlushVmCnt);
2500     OldWaitcntInstr = nullptr;
2501 
2502     // Restore vccz if it's not known to be correct already.
2503     bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2504 
2505     // Don't examine operands unless we need to track vccz correctness.
2506     if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2507       if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2508           Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2509         // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2510         if (!ST->partialVCCWritesUpdateVCCZ())
2511           VCCZCorrect = false;
2512       } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2513         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2514         // vccz bit, so when we detect that an instruction may read from a
2515         // corrupt vccz bit, we need to:
2516         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2517         //    operations to complete.
2518         // 2. Restore the correct value of vccz by writing the current value
2519         //    of vcc back to vcc.
2520         if (ST->hasReadVCCZBug() &&
2521             ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2522           // Writes to vcc while there's an outstanding smem read may get
2523           // clobbered as soon as any read completes.
2524           VCCZCorrect = false;
2525         } else {
2526           // Writes to vcc will fix any incorrect value in vccz.
2527           VCCZCorrect = true;
2528         }
2529       }
2530     }
2531 
2532     if (TII->isSMRD(Inst)) {
2533       for (const MachineMemOperand *Memop : Inst.memoperands()) {
2534         // No need to handle invariant loads when avoiding WAR conflicts, as
2535         // there cannot be a vector store to the same memory location.
2536         if (!Memop->isInvariant()) {
2537           const Value *Ptr = Memop->getValue();
2538           SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2539         }
2540       }
2541       if (ST->hasReadVCCZBug()) {
2542         // This smem read could complete and clobber vccz at any time.
2543         VCCZCorrect = false;
2544       }
2545     }
2546 
2547     updateEventWaitcntAfter(Inst, &ScoreBrackets);
2548 
2549     Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2550 
2551     LLVM_DEBUG({
2552       Inst.print(dbgs());
2553       ScoreBrackets.dump();
2554     });
2555 
2556     // TODO: Remove this workaround once the scheduler is fixed, and enable
2557     // the assert above.
2558     if (RestoreVCCZ) {
2559       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
2560       // bit is updated, so we can restore the bit by reading the value of
2561       // vcc and then writing it back to the register.
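      // On wave64 this is "s_mov_b64 vcc, vcc" (on wave32,
      // "s_mov_b32 vcc_lo, vcc_lo").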
2562       BuildMI(Block, Inst, Inst.getDebugLoc(),
2563               TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2564               TRI->getVCC())
2565           .addReg(TRI->getVCC());
2566       VCCZCorrect = true;
2567       Modified = true;
2568     }
2569 
2570     ++Iter;
2571   }
2572 
2573   // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2574   // needed.
2575   AMDGPU::Waitcnt Wait;
2576   if (Block.getFirstTerminator() == Block.end() &&
2577       isPreheaderToFlush(Block, ScoreBrackets)) {
2578     if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2579       Wait.LoadCnt = 0;
2580     if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2581       Wait.SampleCnt = 0;
2582     if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2583       Wait.BvhCnt = 0;
2584   }
2585 
2586   // Combine or remove any redundant waitcnts at the end of the block.
2587   Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2588                               OldWaitcntInstr);
2589 
2590   LLVM_DEBUG({
2591     dbgs() << "*** End Block: ";
2592     Block.printName(dbgs());
2593     ScoreBrackets.dump();
2594   });
2595 
2596   return Modified;
2597 }
2598 
2599 // Return true if the given machine basic block is a preheader of a loop in
2600 // which we want to flush the vmcnt counter, and false otherwise.
2601 bool SIInsertWaitcnts::isPreheaderToFlush(
2602     MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
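  // Memoize the answer per block: this query can be made repeatedly during
  // the fixed-point iteration, and shouldFlushVmCnt scans the whole loop.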
2603   auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2604   if (!IsInserted)
2605     return Iterator->second;
2606 
2607   MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2608   if (!Succ)
2609     return false;
2610 
2611   MachineLoop *Loop = MLI->getLoopFor(Succ);
2612   if (!Loop)
2613     return false;
2614 
2615   if (Loop->getLoopPreheader() == &MBB &&
2616       shouldFlushVmCnt(Loop, ScoreBrackets)) {
2617     Iterator->second = true;
2618     return true;
2619   }
2620 
2621   return false;
2622 }
2623 
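// FLAT instructions may address LDS as well as global memory, so only treat
// them as VMEM when they can actually access VMEM.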
2624 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2625   if (SIInstrInfo::isFLAT(MI))
2626     return mayAccessVMEMThroughFlat(MI);
2627   return SIInstrInfo::isVMEM(MI);
2628 }
2629 
2630 // Return true if it is better to flush the vmcnt counter in the preheader of
2631 // the given loop. We currently decide to flush in two situations:
2632 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
2633 //    vgpr containing a value that is loaded outside of the loop. (Only on
2634 //    targets with no vscnt counter).
2635 // 2. The loop contains vmem load(s), but the loaded values are not used in the
2636 //    loop, and at least one use of a vgpr containing a value that is loaded
2637 //    outside of the loop.
2638 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2639                                         const WaitcntBrackets &Brackets) {
2640   bool HasVMemLoad = false;
2641   bool HasVMemStore = false;
2642   bool UsesVgprLoadedOutside = false;
2643   DenseSet<Register> VgprUse;
2644   DenseSet<Register> VgprDef;
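  // VgprUse holds vector registers read in the loop, VgprDef vector
  // registers written by in-loop VMEM loads. Any overlap means the loop
  // consumes its own loads, so a preheader flush would not help.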
2645 
2646   for (MachineBasicBlock *MBB : ML->blocks()) {
2647     for (MachineInstr &MI : *MBB) {
2648       if (isVMEMOrFlatVMEM(MI)) {
2649         if (MI.mayLoad())
2650           HasVMemLoad = true;
2651         if (MI.mayStore())
2652           HasVMemStore = true;
2653       }
2654       for (const MachineOperand &Op : MI.all_uses()) {
2655         if (!TRI->isVectorRegister(*MRI, Op.getReg()))
2656           continue;
2657         RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2658         // Vgpr use
2659         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2660           // If this use reads a register that is loaded inside the loop,
2661           // conditions 1. and 2. above are invalidated and we can exit.
2662           if (VgprDef.contains(RegNo))
2663             return false;
2664           VgprUse.insert(RegNo);
2665           // If at least one of Op's registers is in the score brackets, the
2666           // value is likely loaded outside of the loop.
2667           if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2668                   Brackets.getScoreLB(LOAD_CNT) ||
2669               Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2670                   Brackets.getScoreLB(SAMPLE_CNT) ||
2671               Brackets.getRegScore(RegNo, BVH_CNT) >
2672                   Brackets.getScoreLB(BVH_CNT)) {
2673             UsesVgprLoadedOutside = true;
2674             break;
2675           }
2676         }
2677       }
2678 
2679       // VMem load vgpr def
2680       if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2681         for (const MachineOperand &Op : MI.all_defs()) {
2682           RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2683           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2684             // If this load writes a register that is also used inside the
2685             // loop, conditions 1. and 2. above are invalidated and we can exit.
2686             if (VgprUse.contains(RegNo))
2687               return false;
2688             VgprDef.insert(RegNo);
2689           }
2690         }
2691       }
2692     }
2693   }
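  // The first return below implements case 1. of the comment above, the
  // second case 2.; reaching this point means no value loaded inside the
  // loop is also used inside it.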
2694   if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2695     return true;
2696   return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2697 }
2698 
2699 bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2700   auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2701   auto *PDT =
2702       &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
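  // Alias analysis is optional here; without it the pass simply makes more
  // conservative aliasing decisions.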
2703   AliasAnalysis *AA = nullptr;
2704   if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2705     AA = &AAR->getAAResults();
2706 
2707   return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2708 }
2709 
2710 PreservedAnalyses
2711 SIInsertWaitcntsPass::run(MachineFunction &MF,
2712                           MachineFunctionAnalysisManager &MFAM) {
2713   auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2714   auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2715   auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2716                  .getManager()
2717                  .getCachedResult<AAManager>(MF.getFunction());
2718 
2719   if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2720     return PreservedAnalyses::all();
2721 
2722   return getMachineFunctionPassPreservedAnalyses()
2723       .preserveSet<CFGAnalyses>()
2724       .preserve<AAManager>();
2725 }
2726 
2727 bool SIInsertWaitcnts::run(MachineFunction &MF) {
2728   ST = &MF.getSubtarget<GCNSubtarget>();
2729   TII = ST->getInstrInfo();
2730   TRI = &TII->getRegisterInfo();
2731   MRI = &MF.getRegInfo();
2732   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2733 
2734   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2735 
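  // gfx12+ splits the legacy counters into extended ones and uses separate
  // s_wait_* instructions, so pick the matching waitcnt generator.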
2736   if (ST->hasExtendedWaitCounts()) {
2737     MaxCounter = NUM_EXTENDED_INST_CNTS;
2738     WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2739     WCG = &WCGGFX12Plus;
2740   } else {
2741     MaxCounter = NUM_NORMAL_INST_CNTS;
2742     WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2743     WCG = &WCGPreGFX12;
2744   }
2745 
2746   for (auto T : inst_counter_types())
2747     ForceEmitWaitcnt[T] = false;
2748 
2749   const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2750 
2751   SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2752 
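  // The maximum value of each counter is ISA-dependent; derive the limits
  // from the per-counter bit masks of this ISA version.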
2753   HardwareLimits Limits = {};
2754   if (ST->hasExtendedWaitCounts()) {
2755     Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2756     Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2757   } else {
2758     Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2759     Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2760   }
2761   Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2762   Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2763   Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2764   Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2765   Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2766   Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2767 
2768   [[maybe_unused]] unsigned NumVGPRsMax =
2769       ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
2770   [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2771   assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2772   assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2773 
2774   BlockInfos.clear();
2775   bool Modified = false;
2776 
2777   MachineBasicBlock &EntryBB = MF.front();
2778   MachineBasicBlock::iterator I = EntryBB.begin();
2779 
2780   if (!MFI->isEntryFunction()) {
2781     // Wait for any outstanding memory operations that the input registers may
2782     // depend on. We can't track them and it's better to do the wait after the
2783     // costly call sequence.
2784 
2785     // TODO: Could insert earlier and schedule more liberally with operations
2786     // that only use caller preserved registers.
2787     for (MachineBasicBlock::iterator E = EntryBB.end();
2788          I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2789       ;
2790 
2791     if (ST->hasExtendedWaitCounts()) {
2792       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2793           .addImm(0);
2794       for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2795         if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2796           continue;
2797 
2798         if (!ST->hasImageInsts() &&
2799             (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2800           continue;
2801 
2802         BuildMI(EntryBB, I, DebugLoc(),
2803                 TII->get(instrsForExtendedCounterTypes[CT]))
2804             .addImm(0);
2805       }
2806     } else {
2807       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
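      // An immediate of 0 decodes as vmcnt(0) expcnt(0) lgkmcnt(0), i.e. a
      // wait for all legacy counters to drain.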
2808     }
2809 
2810     auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2811         ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2812     NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2813     BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2814 
2815     Modified = true;
2816   }
2817 
2818   // Keep iterating over the blocks in reverse post order, inserting and
2819   // updating s_waitcnt where needed, until a fixed point is reached.
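  // BlockInfos is a MapVector populated in reverse post order, so comparing
  // iterator positions below is effectively a backedge test.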
2820   for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2821     BlockInfos.try_emplace(MBB);
2822 
2823   std::unique_ptr<WaitcntBrackets> Brackets;
2824   bool Repeat;
2825   do {
2826     Repeat = false;
2827 
2828     for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2829          ++BII) {
2830       MachineBasicBlock *MBB = BII->first;
2831       BlockInfo &BI = BII->second;
2832       if (!BI.Dirty)
2833         continue;
2834 
2835       if (BI.Incoming) {
2836         if (!Brackets)
2837           Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2838         else
2839           *Brackets = *BI.Incoming;
2840       } else {
2841         if (!Brackets) {
2842           Brackets = std::make_unique<WaitcntBrackets>(
2843               ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2844         } else {
2845           // Reinitialize in-place. N.B. do not do this by assigning from a
2846           // temporary because the WaitcntBrackets class is large and it could
2847           // cause this function to use an unreasonable amount of stack space.
2848           Brackets->~WaitcntBrackets();
2849           new (Brackets.get()) WaitcntBrackets(
2850               ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2851         }
2852       }
2853 
2854       Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2855       BI.Dirty = false;
2856 
2857       if (Brackets->hasPendingEvent()) {
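        // Propagate the end-of-block state to each successor. To avoid a
        // copy, the brackets are moved (not copied) into the first successor
        // that has no incoming state yet.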
2858         BlockInfo *MoveBracketsToSucc = nullptr;
2859         for (MachineBasicBlock *Succ : MBB->successors()) {
2860           auto *SuccBII = BlockInfos.find(Succ);
2861           BlockInfo &SuccBI = SuccBII->second;
2862           if (!SuccBI.Incoming) {
2863             SuccBI.Dirty = true;
2864             if (SuccBII <= BII) {
2865               LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2866               Repeat = true;
2867             }
2868             if (!MoveBracketsToSucc) {
2869               MoveBracketsToSucc = &SuccBI;
2870             } else {
2871               SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2872             }
2873           } else if (SuccBI.Incoming->merge(*Brackets)) {
2874             SuccBI.Dirty = true;
2875             if (SuccBII <= BII) {
2876               LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2877               Repeat = true;
2878             }
2879           }
2880         }
2881         if (MoveBracketsToSucc)
2882           MoveBracketsToSucc->Incoming = std::move(Brackets);
2883       }
2884     }
2885   } while (Repeat);
2886 
2887   if (ST->hasScalarStores()) {
2888     SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2889     bool HaveScalarStores = false;
2890 
2891     for (MachineBasicBlock &MBB : MF) {
2892       for (MachineInstr &MI : MBB) {
2893         if (!HaveScalarStores && TII->isScalarStore(MI))
2894           HaveScalarStores = true;
2895 
2896         if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2897             MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2898           EndPgmBlocks.push_back(&MBB);
2899       }
2900     }
2901 
2902     if (HaveScalarStores) {
2903       // If scalar writes are used, the cache must be flushed or else the next
2904       // wave to reuse the same scratch memory can be clobbered.
2905       //
2906       // Insert s_dcache_wb at wave termination points if there were any scalar
2907       // stores, and only if the cache hasn't already been flushed. This could
2908       // be improved by looking across blocks for flushes in postdominating
2909       // be improved by looking across blocks for flushes in postdominating
2910       // blocks from the stores, but an explicitly requested flush is
2911       // probably very rare.
2912         bool SeenDCacheWB = false;
2913 
2914         for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2915              I != E; ++I) {
2916           if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2917             SeenDCacheWB = true;
2918           else if (TII->isScalarStore(*I))
2919             SeenDCacheWB = false;
2920 
2921           // FIXME: It would be better to insert this before a waitcnt if any.
2922           if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2923                I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2924               !SeenDCacheWB) {
2925             Modified = true;
2926             BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2927           }
2928         }
2929       }
2930     }
2931   }
2932 
2933   // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2934   // This is done in different ways depending on how the VGPRs were allocated
2935   // (i.e. whether we're in dynamic VGPR mode or not).
2936   // Skip deallocation if the kernel is waveslot limited rather than VGPR
2937   // limited: a short waveslot-limited kernel runs slower with the deallocation.
2938   if (MFI->isDynamicVGPREnabled()) {
2939     for (MachineInstr *MI : ReleaseVGPRInsts) {
2940       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2941               TII->get(AMDGPU::S_ALLOC_VGPR))
2942           .addImm(0);
2943       Modified = true;
2944     }
2945   } else {
2946     if (!ReleaseVGPRInsts.empty() &&
2947         (MF.getFrameInfo().hasCalls() ||
2948          ST->getOccupancyWithNumVGPRs(
2949              TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
2950              /*IsDynamicVGPR=*/false) <
2951              AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2952       for (MachineInstr *MI : ReleaseVGPRInsts) {
2953         if (ST->requiresNopBeforeDeallocVGPRs()) {
2954           BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2955                   TII->get(AMDGPU::S_NOP))
2956               .addImm(0);
2957         }
2958         BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2959                 TII->get(AMDGPU::S_SENDMSG))
2960             .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2961         Modified = true;
2962       }
2963     }
2964   }
2965   ReleaseVGPRInsts.clear();
2966   PreheadersToFlush.clear();
2967   SLoadAddresses.clear();
2968 
2969   return Modified;
2970 }
2971