xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (revision 3dd5524264095ed8612c28908e13f80668eff2f9)
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/ADT/Sequence.h"
34 #include "llvm/CodeGen/MachineLoopInfo.h"
35 #include "llvm/CodeGen/MachinePostDominators.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Support/DebugCounter.h"
38 #include "llvm/Support/TargetParser.h"
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "si-insert-waitcnts"
42 
43 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
44               "Force emit s_waitcnt expcnt(0) instrs");
45 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
46               "Force emit s_waitcnt lgkmcnt(0) instrs");
47 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
48               "Force emit s_waitcnt vmcnt(0) instrs");
49 
50 static cl::opt<bool> ForceEmitZeroFlag(
51   "amdgpu-waitcnt-forcezero",
52   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
53   cl::init(false), cl::Hidden);
54 
55 namespace {
56 // Class of object that encapsulates latest instruction counter score
57 // associated with the operand.  Used for determining whether
58 // s_waitcnt instruction needs to be emitted.
59 
60 #define CNT_MASK(t) (1u << (t))
61 
62 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
63 } // namespace
64 
65 namespace llvm {
66 template <> struct enum_iteration_traits<InstCounterType> {
67   static constexpr bool is_iterable = true;
68 };
69 } // namespace llvm
70 
71 namespace {
72 auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
73 
74 using RegInterval = std::pair<int, int>;
75 
76 struct HardwareLimits {
77   unsigned VmcntMax;
78   unsigned ExpcntMax;
79   unsigned LgkmcntMax;
80   unsigned VscntMax;
81 };
82 
83 struct RegisterEncoding {
84   unsigned VGPR0;
85   unsigned VGPRL;
86   unsigned SGPR0;
87   unsigned SGPRL;
88 };
89 
90 enum WaitEventType {
91   VMEM_ACCESS,       // vector-memory read & write
92   VMEM_READ_ACCESS,  // vector-memory read
93   VMEM_WRITE_ACCESS, // vector-memory write
94   LDS_ACCESS,        // lds read & write
95   GDS_ACCESS,        // gds read & write
96   SQ_MESSAGE,        // send message
97   SMEM_ACCESS,       // scalar-memory read & write
98   EXP_GPR_LOCK,      // export holding on its data src
99   GDS_GPR_LOCK,      // GDS holding on its data and addr src
100   EXP_POS_ACCESS,    // write to export position
101   EXP_PARAM_ACCESS,  // write to export parameter
102   VMW_GPR_LOCK,      // vector-memory write holding on its data src
103   EXP_LDS_ACCESS,    // read by ldsdir counting as export
104   NUM_WAIT_EVENTS,
105 };
106 
107 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
108     (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
109     (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
110         (1 << SQ_MESSAGE),
111     (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
112         (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
113     (1 << VMEM_WRITE_ACCESS)};
114 
115 // The mapping is:
116 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
117 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
118 //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
119 // We reserve a fixed number of VGPR slots in the scoring tables for
120 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
121 enum RegisterMapping {
122   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
123   AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
124   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
125   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
126   EXTRA_VGPR_LDS = 0,     // An artificial register to track LDS writes.
127   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
128 };
129 
130 // Enumerate different types of result-returning VMEM operations. Although
131 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
132 // s_waitcnt only instructions of the same VmemType are guaranteed to write
133 // their results in order -- so there is no need to insert an s_waitcnt between
134 // two instructions of the same type that write the same vgpr.
135 enum VmemType {
136   // BUF instructions and MIMG instructions without a sampler.
137   VMEM_NOSAMPLER,
138   // MIMG instructions with a sampler.
139   VMEM_SAMPLER,
140   // BVH instructions
141   VMEM_BVH
142 };
143 
144 VmemType getVmemType(const MachineInstr &Inst) {
145   assert(SIInstrInfo::isVMEM(Inst));
146   if (!SIInstrInfo::isMIMG(Inst))
147     return VMEM_NOSAMPLER;
148   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
149   const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
150       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
151   return BaseInfo->BVH ? VMEM_BVH
152                        : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
153 }
154 
155 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
156   switch (T) {
157   case VM_CNT:
158     Wait.VmCnt = std::min(Wait.VmCnt, Count);
159     break;
160   case EXP_CNT:
161     Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
162     break;
163   case LGKM_CNT:
164     Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
165     break;
166   case VS_CNT:
167     Wait.VsCnt = std::min(Wait.VsCnt, Count);
168     break;
169   default:
170     llvm_unreachable("bad InstCounterType");
171   }
172 }
173 
174 // This objects maintains the current score brackets of each wait counter, and
175 // a per-register scoreboard for each wait counter.
176 //
177 // We also maintain the latest score for every event type that can change the
178 // waitcnt in order to know if there are multiple types of events within
179 // the brackets. When multiple types of event happen in the bracket,
180 // wait count may get decreased out of order, therefore we need to put in
181 // "s_waitcnt 0" before use.
182 class WaitcntBrackets {
183 public:
184   WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
185                   RegisterEncoding Encoding)
186       : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
187 
188   unsigned getWaitCountMax(InstCounterType T) const {
189     switch (T) {
190     case VM_CNT:
191       return Limits.VmcntMax;
192     case LGKM_CNT:
193       return Limits.LgkmcntMax;
194     case EXP_CNT:
195       return Limits.ExpcntMax;
196     case VS_CNT:
197       return Limits.VscntMax;
198     default:
199       break;
200     }
201     return 0;
202   }
203 
204   unsigned getScoreLB(InstCounterType T) const {
205     assert(T < NUM_INST_CNTS);
206     return ScoreLBs[T];
207   }
208 
209   unsigned getScoreUB(InstCounterType T) const {
210     assert(T < NUM_INST_CNTS);
211     return ScoreUBs[T];
212   }
213 
214   // Mapping from event to counter.
215   InstCounterType eventCounter(WaitEventType E) {
216     if (WaitEventMaskForInst[VM_CNT] & (1 << E))
217       return VM_CNT;
218     if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
219       return LGKM_CNT;
220     if (WaitEventMaskForInst[VS_CNT] & (1 << E))
221       return VS_CNT;
222     assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
223     return EXP_CNT;
224   }
225 
226   unsigned getRegScore(int GprNo, InstCounterType T) {
227     if (GprNo < NUM_ALL_VGPRS) {
228       return VgprScores[T][GprNo];
229     }
230     assert(T == LGKM_CNT);
231     return SgprScores[GprNo - NUM_ALL_VGPRS];
232   }
233 
234   bool merge(const WaitcntBrackets &Other);
235 
236   RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
237                              const MachineRegisterInfo *MRI,
238                              const SIRegisterInfo *TRI, unsigned OpNo) const;
239 
240   bool counterOutOfOrder(InstCounterType T) const;
241   void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
242   void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
243   void determineWait(InstCounterType T, unsigned ScoreToWait,
244                      AMDGPU::Waitcnt &Wait) const;
245   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
246   void applyWaitcnt(InstCounterType T, unsigned Count);
247   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
248                      const MachineRegisterInfo *MRI, WaitEventType E,
249                      MachineInstr &MI);
250 
251   bool hasPending() const { return PendingEvents != 0; }
252   bool hasPendingEvent(WaitEventType E) const {
253     return PendingEvents & (1 << E);
254   }
255 
256   bool hasMixedPendingEvents(InstCounterType T) const {
257     unsigned Events = PendingEvents & WaitEventMaskForInst[T];
258     // Return true if more than one bit is set in Events.
259     return Events & (Events - 1);
260   }
261 
262   bool hasPendingFlat() const {
263     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
264              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
265             (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
266              LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
267   }
268 
269   void setPendingFlat() {
270     LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
271     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
272   }
273 
274   // Return true if there might be pending writes to the specified vgpr by VMEM
275   // instructions with types different from V.
276   bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
277     assert(GprNo < NUM_ALL_VGPRS);
278     return VgprVmemTypes[GprNo] & ~(1 << V);
279   }
280 
281   void clearVgprVmemTypes(int GprNo) {
282     assert(GprNo < NUM_ALL_VGPRS);
283     VgprVmemTypes[GprNo] = 0;
284   }
285 
286   void print(raw_ostream &);
287   void dump() { print(dbgs()); }
288 
289 private:
290   struct MergeInfo {
291     unsigned OldLB;
292     unsigned OtherLB;
293     unsigned MyShift;
294     unsigned OtherShift;
295   };
296   static bool mergeScore(const MergeInfo &M, unsigned &Score,
297                          unsigned OtherScore);
298 
299   void setScoreLB(InstCounterType T, unsigned Val) {
300     assert(T < NUM_INST_CNTS);
301     ScoreLBs[T] = Val;
302   }
303 
304   void setScoreUB(InstCounterType T, unsigned Val) {
305     assert(T < NUM_INST_CNTS);
306     ScoreUBs[T] = Val;
307     if (T == EXP_CNT) {
308       unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
309       if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
310         ScoreLBs[T] = UB;
311     }
312   }
313 
314   void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
315     if (GprNo < NUM_ALL_VGPRS) {
316       VgprUB = std::max(VgprUB, GprNo);
317       VgprScores[T][GprNo] = Val;
318     } else {
319       assert(T == LGKM_CNT);
320       SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
321       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
322     }
323   }
324 
325   void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
326                    const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
327                    unsigned OpNo, unsigned Val);
328 
329   const GCNSubtarget *ST = nullptr;
330   HardwareLimits Limits = {};
331   RegisterEncoding Encoding = {};
332   unsigned ScoreLBs[NUM_INST_CNTS] = {0};
333   unsigned ScoreUBs[NUM_INST_CNTS] = {0};
334   unsigned PendingEvents = 0;
335   // Remember the last flat memory operation.
336   unsigned LastFlat[NUM_INST_CNTS] = {0};
337   // wait_cnt scores for every vgpr.
338   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
339   int VgprUB = -1;
340   int SgprUB = -1;
341   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
342   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
343   unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
344   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
345   // write to each vgpr.
346   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
347 };
348 
349 class SIInsertWaitcnts : public MachineFunctionPass {
350 private:
351   const GCNSubtarget *ST = nullptr;
352   const SIInstrInfo *TII = nullptr;
353   const SIRegisterInfo *TRI = nullptr;
354   const MachineRegisterInfo *MRI = nullptr;
355   AMDGPU::IsaVersion IV;
356 
357   DenseSet<MachineInstr *> TrackedWaitcntSet;
358   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
359   DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
360   MachineLoopInfo *MLI;
361   MachinePostDominatorTree *PDT;
362 
363   struct BlockInfo {
364     MachineBasicBlock *MBB;
365     std::unique_ptr<WaitcntBrackets> Incoming;
366     bool Dirty = true;
367 
368     explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
369   };
370 
371   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
372 
373   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
374   // because of amdgpu-waitcnt-forcezero flag
375   bool ForceEmitZeroWaitcnts;
376   bool ForceEmitWaitcnt[NUM_INST_CNTS];
377 
378 public:
379   static char ID;
380 
381   SIInsertWaitcnts() : MachineFunctionPass(ID) {
382     (void)ForceExpCounter;
383     (void)ForceLgkmCounter;
384     (void)ForceVMCounter;
385   }
386 
387   bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
388   bool isPreheaderToFlush(MachineBasicBlock &MBB,
389                           WaitcntBrackets &ScoreBrackets);
390   bool runOnMachineFunction(MachineFunction &MF) override;
391 
392   StringRef getPassName() const override {
393     return "SI insert wait instructions";
394   }
395 
396   void getAnalysisUsage(AnalysisUsage &AU) const override {
397     AU.setPreservesCFG();
398     AU.addRequired<MachineLoopInfo>();
399     AU.addRequired<MachinePostDominatorTree>();
400     MachineFunctionPass::getAnalysisUsage(AU);
401   }
402 
403   bool isForceEmitWaitcnt() const {
404     for (auto T : inst_counter_types())
405       if (ForceEmitWaitcnt[T])
406         return true;
407     return false;
408   }
409 
410   void setForceEmitWaitcnt() {
411 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
412 // For debug builds, get the debug counter info and adjust if need be
413 #ifndef NDEBUG
414     if (DebugCounter::isCounterSet(ForceExpCounter) &&
415         DebugCounter::shouldExecute(ForceExpCounter)) {
416       ForceEmitWaitcnt[EXP_CNT] = true;
417     } else {
418       ForceEmitWaitcnt[EXP_CNT] = false;
419     }
420 
421     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
422         DebugCounter::shouldExecute(ForceLgkmCounter)) {
423       ForceEmitWaitcnt[LGKM_CNT] = true;
424     } else {
425       ForceEmitWaitcnt[LGKM_CNT] = false;
426     }
427 
428     if (DebugCounter::isCounterSet(ForceVMCounter) &&
429         DebugCounter::shouldExecute(ForceVMCounter)) {
430       ForceEmitWaitcnt[VM_CNT] = true;
431     } else {
432       ForceEmitWaitcnt[VM_CNT] = false;
433     }
434 #endif // NDEBUG
435   }
436 
437   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
438   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
439   bool generateWaitcntInstBefore(MachineInstr &MI,
440                                  WaitcntBrackets &ScoreBrackets,
441                                  MachineInstr *OldWaitcntInstr,
442                                  bool FlushVmCnt);
443   bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
444                                WaitcntBrackets &ScoreBrackets,
445                                MachineInstr *OldWaitcntInstr);
446   bool generateWaitcnt(AMDGPU::Waitcnt Wait,
447                        MachineBasicBlock::instr_iterator It,
448                        MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
449                        MachineInstr *OldWaitcntInstr);
450   void updateEventWaitcntAfter(MachineInstr &Inst,
451                                WaitcntBrackets *ScoreBrackets);
452   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
453                             WaitcntBrackets &ScoreBrackets);
454   bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
455                                MachineInstr &OldWaitcntInstr,
456                                AMDGPU::Waitcnt &Wait,
457                                MachineBasicBlock::instr_iterator It);
458 };
459 
460 } // end anonymous namespace
461 
462 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
463                                             const SIInstrInfo *TII,
464                                             const MachineRegisterInfo *MRI,
465                                             const SIRegisterInfo *TRI,
466                                             unsigned OpNo) const {
467   const MachineOperand &Op = MI->getOperand(OpNo);
468   if (!TRI->isInAllocatableClass(Op.getReg()))
469     return {-1, -1};
470 
471   // A use via a PW operand does not need a waitcnt.
472   // A partial write is not a WAW.
473   assert(!Op.getSubReg() || !Op.isUndef());
474 
475   RegInterval Result;
476 
477   unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
478 
479   if (TRI->isVectorRegister(*MRI, Op.getReg())) {
480     assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
481     Result.first = Reg - Encoding.VGPR0;
482     if (TRI->isAGPR(*MRI, Op.getReg()))
483       Result.first += AGPR_OFFSET;
484     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
485   } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
486     assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
487     Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
488     assert(Result.first >= NUM_ALL_VGPRS &&
489            Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
490   }
491   // TODO: Handle TTMP
492   // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
493   else
494     return {-1, -1};
495 
496   const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
497   unsigned Size = TRI->getRegSizeInBits(*RC);
498   Result.second = Result.first + ((Size + 16) / 32);
499 
500   return Result;
501 }
502 
503 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
504                                   const SIInstrInfo *TII,
505                                   const SIRegisterInfo *TRI,
506                                   const MachineRegisterInfo *MRI, unsigned OpNo,
507                                   unsigned Val) {
508   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
509   assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
510   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
511     setRegScore(RegNo, EXP_CNT, Val);
512   }
513 }
514 
515 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
516 // can be accessed. A load from LDS to VMEM does not need a wait.
517 static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
518   return SIInstrInfo::isVALU(MI) &&
519          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
520          MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
521 }
522 
523 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
524                                     const SIRegisterInfo *TRI,
525                                     const MachineRegisterInfo *MRI,
526                                     WaitEventType E, MachineInstr &Inst) {
527   InstCounterType T = eventCounter(E);
528   unsigned CurrScore = getScoreUB(T) + 1;
529   if (CurrScore == 0)
530     report_fatal_error("InsertWaitcnt score wraparound");
531   // PendingEvents and ScoreUB need to be update regardless if this event
532   // changes the score of a register or not.
533   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
534   PendingEvents |= 1 << E;
535   setScoreUB(T, CurrScore);
536 
537   if (T == EXP_CNT) {
538     // Put score on the source vgprs. If this is a store, just use those
539     // specific register(s).
540     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
541       int AddrOpIdx =
542           AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
543       // All GDS operations must protect their address register (same as
544       // export.)
545       if (AddrOpIdx != -1) {
546         setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
547       }
548 
549       if (Inst.mayStore()) {
550         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
551                                        AMDGPU::OpName::data0) != -1) {
552           setExpScore(
553               &Inst, TII, TRI, MRI,
554               AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
555               CurrScore);
556         }
557         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
558                                        AMDGPU::OpName::data1) != -1) {
559           setExpScore(&Inst, TII, TRI, MRI,
560                       AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
561                                                  AMDGPU::OpName::data1),
562                       CurrScore);
563         }
564       } else if (SIInstrInfo::isAtomicRet(Inst) &&
565                  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
566                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
567                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
568                  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
569                  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
570                  Inst.getOpcode() != AMDGPU::DS_APPEND &&
571                  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
572                  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
573         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
574           const MachineOperand &Op = Inst.getOperand(I);
575           if (Op.isReg() && !Op.isDef() &&
576               TRI->isVectorRegister(*MRI, Op.getReg())) {
577             setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
578           }
579         }
580       }
581     } else if (TII->isFLAT(Inst)) {
582       if (Inst.mayStore()) {
583         setExpScore(
584             &Inst, TII, TRI, MRI,
585             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
586             CurrScore);
587       } else if (SIInstrInfo::isAtomicRet(Inst)) {
588         setExpScore(
589             &Inst, TII, TRI, MRI,
590             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
591             CurrScore);
592       }
593     } else if (TII->isMIMG(Inst)) {
594       if (Inst.mayStore()) {
595         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
596       } else if (SIInstrInfo::isAtomicRet(Inst)) {
597         setExpScore(
598             &Inst, TII, TRI, MRI,
599             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
600             CurrScore);
601       }
602     } else if (TII->isMTBUF(Inst)) {
603       if (Inst.mayStore()) {
604         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
605       }
606     } else if (TII->isMUBUF(Inst)) {
607       if (Inst.mayStore()) {
608         setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
609       } else if (SIInstrInfo::isAtomicRet(Inst)) {
610         setExpScore(
611             &Inst, TII, TRI, MRI,
612             AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
613             CurrScore);
614       }
615     } else if (TII->isLDSDIR(Inst)) {
616       // LDSDIR instructions attach the score to the destination.
617       setExpScore(
618           &Inst, TII, TRI, MRI,
619           AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
620           CurrScore);
621     } else {
622       if (TII->isEXP(Inst)) {
623         // For export the destination registers are really temps that
624         // can be used as the actual source after export patching, so
625         // we need to treat them like sources and set the EXP_CNT
626         // score.
627         for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
628           MachineOperand &DefMO = Inst.getOperand(I);
629           if (DefMO.isReg() && DefMO.isDef() &&
630               TRI->isVGPR(*MRI, DefMO.getReg())) {
631             setRegScore(
632                 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
633                 EXP_CNT, CurrScore);
634           }
635         }
636       }
637       for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
638         MachineOperand &MO = Inst.getOperand(I);
639         if (MO.isReg() && !MO.isDef() &&
640             TRI->isVectorRegister(*MRI, MO.getReg())) {
641           setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
642         }
643       }
644     }
645 #if 0 // TODO: check if this is handled by MUBUF code above.
646   } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
647        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
648        Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
649     MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
650     unsigned OpNo;//TODO: find the OpNo for this operand;
651     RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
652     for (int RegNo = Interval.first; RegNo < Interval.second;
653     ++RegNo) {
654       setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
655     }
656 #endif
657   } else {
658     // Match the score to the destination registers.
659     for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
660       auto &Op = Inst.getOperand(I);
661       if (!Op.isReg() || !Op.isDef())
662         continue;
663       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
664       if (T == VM_CNT) {
665         if (Interval.first >= NUM_ALL_VGPRS)
666           continue;
667         if (SIInstrInfo::isVMEM(Inst)) {
668           VmemType V = getVmemType(Inst);
669           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
670             VgprVmemTypes[RegNo] |= 1 << V;
671         }
672       }
673       for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
674         setRegScore(RegNo, T, CurrScore);
675       }
676     }
677     if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
678       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
679     }
680   }
681 }
682 
683 void WaitcntBrackets::print(raw_ostream &OS) {
684   OS << '\n';
685   for (auto T : inst_counter_types()) {
686     unsigned LB = getScoreLB(T);
687     unsigned UB = getScoreUB(T);
688 
689     switch (T) {
690     case VM_CNT:
691       OS << "    VM_CNT(" << UB - LB << "): ";
692       break;
693     case LGKM_CNT:
694       OS << "    LGKM_CNT(" << UB - LB << "): ";
695       break;
696     case EXP_CNT:
697       OS << "    EXP_CNT(" << UB - LB << "): ";
698       break;
699     case VS_CNT:
700       OS << "    VS_CNT(" << UB - LB << "): ";
701       break;
702     default:
703       OS << "    UNKNOWN(" << UB - LB << "): ";
704       break;
705     }
706 
707     if (LB < UB) {
708       // Print vgpr scores.
709       for (int J = 0; J <= VgprUB; J++) {
710         unsigned RegScore = getRegScore(J, T);
711         if (RegScore <= LB)
712           continue;
713         unsigned RelScore = RegScore - LB - 1;
714         if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
715           OS << RelScore << ":v" << J << " ";
716         } else {
717           OS << RelScore << ":ds ";
718         }
719       }
720       // Also need to print sgpr scores for lgkm_cnt.
721       if (T == LGKM_CNT) {
722         for (int J = 0; J <= SgprUB; J++) {
723           unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
724           if (RegScore <= LB)
725             continue;
726           unsigned RelScore = RegScore - LB - 1;
727           OS << RelScore << ":s" << J << " ";
728         }
729       }
730     }
731     OS << '\n';
732   }
733   OS << '\n';
734 }
735 
736 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
737 /// whether a waitcnt instruction is needed at all.
738 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
739   simplifyWaitcnt(VM_CNT, Wait.VmCnt);
740   simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
741   simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
742   simplifyWaitcnt(VS_CNT, Wait.VsCnt);
743 }
744 
745 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
746                                       unsigned &Count) const {
747   const unsigned LB = getScoreLB(T);
748   const unsigned UB = getScoreUB(T);
749 
750   // The number of outstanding events for this type, T, can be calculated
751   // as (UB - LB). If the current Count is greater than or equal to the number
752   // of outstanding events, then the wait for this counter is redundant.
753   if (Count >= UB - LB)
754     Count = ~0u;
755 }
756 
757 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
758                                     AMDGPU::Waitcnt &Wait) const {
759   // If the score of src_operand falls within the bracket, we need an
760   // s_waitcnt instruction.
761   const unsigned LB = getScoreLB(T);
762   const unsigned UB = getScoreUB(T);
763   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
764     if ((T == VM_CNT || T == LGKM_CNT) &&
765         hasPendingFlat() &&
766         !ST->hasFlatLgkmVMemCountInOrder()) {
767       // If there is a pending FLAT operation, and this is a VMem or LGKM
768       // waitcnt and the target can report early completion, then we need
769       // to force a waitcnt 0.
770       addWait(Wait, T, 0);
771     } else if (counterOutOfOrder(T)) {
772       // Counter can get decremented out-of-order when there
773       // are multiple types event in the bracket. Also emit an s_wait counter
774       // with a conservative value of 0 for the counter.
775       addWait(Wait, T, 0);
776     } else {
777       // If a counter has been maxed out avoid overflow by waiting for
778       // MAX(CounterType) - 1 instead.
779       unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
780       addWait(Wait, T, NeededWait);
781     }
782   }
783 }
784 
785 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
786   applyWaitcnt(VM_CNT, Wait.VmCnt);
787   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
788   applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
789   applyWaitcnt(VS_CNT, Wait.VsCnt);
790 }
791 
792 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
793   const unsigned UB = getScoreUB(T);
794   if (Count >= UB)
795     return;
796   if (Count != 0) {
797     if (counterOutOfOrder(T))
798       return;
799     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
800   } else {
801     setScoreLB(T, UB);
802     PendingEvents &= ~WaitEventMaskForInst[T];
803   }
804 }
805 
806 // Where there are multiple types of event in the bracket of a counter,
807 // the decrement may go out of order.
808 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
809   // Scalar memory read always can go out of order.
810   if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
811     return true;
812   return hasMixedPendingEvents(T);
813 }
814 
815 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
816                       false)
817 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
818 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
819 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
820                     false)
821 
822 char SIInsertWaitcnts::ID = 0;
823 
824 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
825 
826 FunctionPass *llvm::createSIInsertWaitcntsPass() {
827   return new SIInsertWaitcnts();
828 }
829 
830 /// Combine consecutive waitcnt instructions that precede \p It and follow
831 /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
832 /// by previous passes. Currently this pass conservatively assumes that these
833 /// preexisting waitcnt are required for correctness.
834 bool SIInsertWaitcnts::applyPreexistingWaitcnt(
835     WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
836     AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) {
837   bool Modified = false;
838   MachineInstr *WaitcntInstr = nullptr;
839   MachineInstr *WaitcntVsCntInstr = nullptr;
840 
841   for (auto &II :
842        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
843     if (II.isMetaInstruction())
844       continue;
845 
846     if (II.getOpcode() == AMDGPU::S_WAITCNT) {
847       // Conservatively update required wait if this waitcnt was added in an
848       // earlier pass. In this case it will not exist in the tracked waitcnt
849       // set.
850       if (!TrackedWaitcntSet.count(&II)) {
851         unsigned IEnc = II.getOperand(0).getImm();
852         AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
853         Wait = Wait.combined(OldWait);
854       }
855 
856       // Merge consecutive waitcnt of the same type by erasing multiples.
857       if (!WaitcntInstr) {
858         WaitcntInstr = &II;
859       } else {
860         II.eraseFromParent();
861         Modified = true;
862       }
863 
864     } else {
865       assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
866       assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
867       if (!TrackedWaitcntSet.count(&II)) {
868         unsigned OldVSCnt =
869             TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
870         Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
871       }
872 
873       if (!WaitcntVsCntInstr) {
874         WaitcntVsCntInstr = &II;
875       } else {
876         II.eraseFromParent();
877         Modified = true;
878       }
879     }
880   }
881 
882   // Updated encoding of merged waitcnt with the required wait.
883   if (WaitcntInstr) {
884     if (Wait.hasWaitExceptVsCnt()) {
885       unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
886       unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
887       if (OldEnc != NewEnc) {
888         WaitcntInstr->getOperand(0).setImm(NewEnc);
889         Modified = true;
890       }
891       ScoreBrackets.applyWaitcnt(Wait);
892       Wait.VmCnt = ~0u;
893       Wait.LgkmCnt = ~0u;
894       Wait.ExpCnt = ~0u;
895 
896       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
897                      ? dbgs() << "applyPreexistingWaitcnt\n"
898                               << "New Instr at block end: " << *WaitcntInstr
899                               << '\n'
900                      : dbgs() << "applyPreexistingWaitcnt\n"
901                               << "Old Instr: " << *It
902                               << "New Instr: " << *WaitcntInstr << '\n');
903 
904     } else {
905       WaitcntInstr->eraseFromParent();
906       Modified = true;
907     }
908   }
909 
910   if (WaitcntVsCntInstr) {
911     if (Wait.hasWaitVsCnt()) {
912       assert(ST->hasVscnt());
913       unsigned OldVSCnt =
914           TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
915               ->getImm();
916       if (Wait.VsCnt != OldVSCnt) {
917         TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
918             ->setImm(Wait.VsCnt);
919         Modified = true;
920       }
921       ScoreBrackets.applyWaitcnt(Wait);
922       Wait.VsCnt = ~0u;
923 
924       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
925                      ? dbgs() << "applyPreexistingWaitcnt\n"
926                               << "New Instr at block end: "
927                               << *WaitcntVsCntInstr << '\n'
928                      : dbgs() << "applyPreexistingWaitcnt\n"
929                               << "Old Instr: " << *It
930                               << "New Instr: " << *WaitcntVsCntInstr << '\n');
931     } else {
932       WaitcntVsCntInstr->eraseFromParent();
933       Modified = true;
934     }
935   }
936 
937   return Modified;
938 }
939 
940 static bool readsVCCZ(const MachineInstr &MI) {
941   unsigned Opc = MI.getOpcode();
942   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
943          !MI.getOperand(1).isUndef();
944 }
945 
946 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
947 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
948   // Currently all conventions wait, but this may not always be the case.
949   //
950   // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
951   // senses to omit the wait and do it in the caller.
952   return true;
953 }
954 
955 /// \returns true if the callee is expected to wait for any outstanding waits
956 /// before returning.
957 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
958   return true;
959 }
960 
961 ///  Generate s_waitcnt instruction to be placed before cur_Inst.
962 ///  Instructions of a given type are returned in order,
963 ///  but instructions of different types can complete out of order.
964 ///  We rely on this in-order completion
965 ///  and simply assign a score to the memory access instructions.
966 ///  We keep track of the active "score bracket" to determine
967 ///  if an access of a memory read requires an s_waitcnt
968 ///  and if so what the value of each counter is.
969 ///  The "score bracket" is bound by the lower bound and upper bound
970 ///  scores (*_score_LB and *_score_ub respectively).
971 ///  If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
972 ///  flush the vmcnt counter here.
973 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
974                                                  WaitcntBrackets &ScoreBrackets,
975                                                  MachineInstr *OldWaitcntInstr,
976                                                  bool FlushVmCnt) {
977   setForceEmitWaitcnt();
978 
979   if (MI.isMetaInstruction())
980     return false;
981 
982   AMDGPU::Waitcnt Wait;
983 
984   // FIXME: This should have already been handled by the memory legalizer.
985   // Removing this currently doesn't affect any lit tests, but we need to
986   // verify that nothing was relying on this. The number of buffer invalidates
987   // being handled here should not be expanded.
988   if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
989       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
990       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
991       MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
992       MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
993     Wait.VmCnt = 0;
994   }
995 
996   // All waits must be resolved at call return.
997   // NOTE: this could be improved with knowledge of all call sites or
998   //   with knowledge of the called routines.
999   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1000       MI.getOpcode() == AMDGPU::SI_RETURN ||
1001       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1002       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1003     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1004   }
1005   // Resolve vm waits before gs-done.
1006   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1007             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1008            ST->hasLegacyGeometry() &&
1009            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1010             AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1011     Wait.VmCnt = 0;
1012   }
1013 #if 0 // TODO: the following blocks of logic when we have fence.
1014   else if (MI.getOpcode() == SC_FENCE) {
1015     const unsigned int group_size =
1016       context->shader_info->GetMaxThreadGroupSize();
1017     // group_size == 0 means thread group size is unknown at compile time
1018     const bool group_is_multi_wave =
1019       (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1020     const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1021 
1022     for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1023       SCRegType src_type = Inst->GetSrcType(i);
1024       switch (src_type) {
1025         case SCMEM_LDS:
1026           if (group_is_multi_wave ||
1027             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1028             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1029                                ScoreBrackets->getScoreUB(LGKM_CNT));
1030             // LDS may have to wait for VM_CNT after buffer load to LDS
1031             if (target_info->HasBufferLoadToLDS()) {
1032               EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1033                                  ScoreBrackets->getScoreUB(VM_CNT));
1034             }
1035           }
1036           break;
1037 
1038         case SCMEM_GDS:
1039           if (group_is_multi_wave || fence_is_global) {
1040             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1041               ScoreBrackets->getScoreUB(EXP_CNT));
1042             EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1043               ScoreBrackets->getScoreUB(LGKM_CNT));
1044           }
1045           break;
1046 
1047         case SCMEM_UAV:
1048         case SCMEM_TFBUF:
1049         case SCMEM_RING:
1050         case SCMEM_SCATTER:
1051           if (group_is_multi_wave || fence_is_global) {
1052             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1053               ScoreBrackets->getScoreUB(EXP_CNT));
1054             EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1055               ScoreBrackets->getScoreUB(VM_CNT));
1056           }
1057           break;
1058 
1059         case SCMEM_SCRATCH:
1060         default:
1061           break;
1062       }
1063     }
1064   }
1065 #endif
1066 
1067   // Export & GDS instructions do not read the EXEC mask until after the export
1068   // is granted (which can occur well after the instruction is issued).
1069   // The shader program must flush all EXP operations on the export-count
1070   // before overwriting the EXEC mask.
1071   else {
1072     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1073       // Export and GDS are tracked individually, either may trigger a waitcnt
1074       // for EXEC.
1075       if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1076           ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1077           ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1078           ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1079         Wait.ExpCnt = 0;
1080       }
1081     }
1082 
1083     if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1084       // The function is going to insert a wait on everything in its prolog.
1085       // This still needs to be careful if the call target is a load (e.g. a GOT
1086       // load). We also need to check WAW dependency with saved PC.
1087       Wait = AMDGPU::Waitcnt();
1088 
1089       int CallAddrOpIdx =
1090           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1091 
1092       if (MI.getOperand(CallAddrOpIdx).isReg()) {
1093         RegInterval CallAddrOpInterval =
1094           ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1095 
1096         for (int RegNo = CallAddrOpInterval.first;
1097              RegNo < CallAddrOpInterval.second; ++RegNo)
1098           ScoreBrackets.determineWait(
1099             LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1100 
1101         int RtnAddrOpIdx =
1102           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1103         if (RtnAddrOpIdx != -1) {
1104           RegInterval RtnAddrOpInterval =
1105             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1106 
1107           for (int RegNo = RtnAddrOpInterval.first;
1108                RegNo < RtnAddrOpInterval.second; ++RegNo)
1109             ScoreBrackets.determineWait(
1110               LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1111         }
1112       }
1113     } else {
1114       // FIXME: Should not be relying on memoperands.
1115       // Look at the source operands of every instruction to see if
1116       // any of them results from a previous memory operation that affects
1117       // its current usage. If so, an s_waitcnt instruction needs to be
1118       // emitted.
1119       // If the source operand was defined by a load, add the s_waitcnt
1120       // instruction.
1121       //
1122       // Two cases are handled for destination operands:
1123       // 1) If the destination operand was defined by a load, add the s_waitcnt
1124       // instruction to guarantee the right WAW order.
1125       // 2) If a destination operand that was used by a recent export/store ins,
1126       // add s_waitcnt on exp_cnt to guarantee the WAR order.
1127       for (const MachineMemOperand *Memop : MI.memoperands()) {
1128         const Value *Ptr = Memop->getValue();
1129         if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1130           addWait(Wait, LGKM_CNT, 0);
1131           if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1132             SLoadAddresses.erase(Ptr);
1133         }
1134         unsigned AS = Memop->getAddrSpace();
1135         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1136           continue;
1137         // No need to wait before load from VMEM to LDS.
1138         if (mayWriteLDSThroughDMA(MI))
1139           continue;
1140         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1141         // VM_CNT is only relevant to vgpr or LDS.
1142         ScoreBrackets.determineWait(
1143             VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1144         if (Memop->isStore()) {
1145           ScoreBrackets.determineWait(
1146               EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1147         }
1148       }
1149 
1150       // Loop over use and def operands.
1151       for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1152         MachineOperand &Op = MI.getOperand(I);
1153         if (!Op.isReg())
1154           continue;
1155         RegInterval Interval =
1156             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1157 
1158         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1159         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1160           if (IsVGPR) {
1161             // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1162             // previous write and this write are the same type of VMEM
1163             // instruction, in which case they're guaranteed to write their
1164             // results in order anyway.
1165             if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1166                 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1167                                                        getVmemType(MI))) {
1168               ScoreBrackets.determineWait(
1169                   VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1170               ScoreBrackets.clearVgprVmemTypes(RegNo);
1171             }
1172             if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1173               ScoreBrackets.determineWait(
1174                   EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1175             }
1176           }
1177           ScoreBrackets.determineWait(
1178               LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1179         }
1180       }
1181     }
1182   }
1183 
1184   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1185   // occurs before the instruction. Doing it here prevents any additional
1186   // S_WAITCNTs from being emitted if the instruction was marked as
1187   // requiring a WAITCNT beforehand.
1188   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1189       !ST->hasAutoWaitcntBeforeBarrier()) {
1190     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1191   }
1192 
1193   // TODO: Remove this work-around, enable the assert for Bug 457939
1194   //       after fixing the scheduler. Also, the Shader Compiler code is
1195   //       independent of target.
1196   if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1197     if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1198             ScoreBrackets.getScoreUB(LGKM_CNT) &&
1199         ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1200       Wait.LgkmCnt = 0;
1201     }
1202   }
1203 
1204   // Verify that the wait is actually needed.
1205   ScoreBrackets.simplifyWaitcnt(Wait);
1206 
1207   if (ForceEmitZeroWaitcnts)
1208     Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
1209 
1210   if (ForceEmitWaitcnt[VM_CNT])
1211     Wait.VmCnt = 0;
1212   if (ForceEmitWaitcnt[EXP_CNT])
1213     Wait.ExpCnt = 0;
1214   if (ForceEmitWaitcnt[LGKM_CNT])
1215     Wait.LgkmCnt = 0;
1216   if (ForceEmitWaitcnt[VS_CNT])
1217     Wait.VsCnt = 0;
1218 
1219   if (FlushVmCnt) {
1220     unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
1221     unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
1222     if (UB - LB != 0)
1223       Wait.VmCnt = 0;
1224   }
1225 
1226   return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1227                          OldWaitcntInstr);
1228 }
1229 
1230 // Add a waitcnt to flush the vmcnt counter at the end of the given block if
1231 // needed.
1232 bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1233                                                WaitcntBrackets &ScoreBrackets,
1234                                                MachineInstr *OldWaitcntInstr) {
1235   AMDGPU::Waitcnt Wait;
1236 
1237   unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
1238   unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
1239   if (UB - LB == 0)
1240     return false;
1241 
1242   Wait.VmCnt = 0;
1243 
1244   return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1245                          OldWaitcntInstr);
1246 }
1247 
1248 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1249                                        MachineBasicBlock::instr_iterator It,
1250                                        MachineBasicBlock &Block,
1251                                        WaitcntBrackets &ScoreBrackets,
1252                                        MachineInstr *OldWaitcntInstr) {
1253   bool Modified = false;
1254   const DebugLoc &DL = Block.findDebugLoc(It);
1255 
1256   if (OldWaitcntInstr)
1257     // Try to merge the required wait with preexisting waitcnt instructions.
1258     // Also erase redundant waitcnt.
1259     Modified =
1260         applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1261   else
1262     ScoreBrackets.applyWaitcnt(Wait);
1263 
1264   // ExpCnt can be merged into VINTERP.
1265   if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1266       SIInstrInfo::isVINTERP(*It)) {
1267     MachineOperand *WaitExp =
1268         TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1269     if (Wait.ExpCnt < WaitExp->getImm()) {
1270       WaitExp->setImm(Wait.ExpCnt);
1271       Modified = true;
1272     }
1273     Wait.ExpCnt = ~0u;
1274 
1275     LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1276                       << "Update Instr: " << *It);
1277   }
1278 
1279   // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1280   // instruction was modified to handle the required wait.
1281   if (Wait.hasWaitExceptVsCnt()) {
1282     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1283     auto SWaitInst =
1284         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1285     TrackedWaitcntSet.insert(SWaitInst);
1286     Modified = true;
1287 
1288     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1289                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1290                dbgs() << "New Instr: " << *SWaitInst << '\n');
1291   }
1292 
1293   if (Wait.hasWaitVsCnt()) {
1294     assert(ST->hasVscnt());
1295 
1296     auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1297                          .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1298                          .addImm(Wait.VsCnt);
1299     TrackedWaitcntSet.insert(SWaitInst);
1300     Modified = true;
1301 
1302     LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1303                if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1304                dbgs() << "New Instr: " << *SWaitInst << '\n');
1305   }
1306   return Modified;
1307 }
1308 
1309 // This is a flat memory operation. Check to see if it has memory tokens other
1310 // than LDS. Other address spaces supported by flat memory operations involve
1311 // global memory.
1312 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1313   assert(TII->isFLAT(MI));
1314 
1315   // All flat instructions use the VMEM counter.
1316   assert(TII->usesVM_CNT(MI));
1317 
1318   // If there are no memory operands then conservatively assume the flat
1319   // operation may access VMEM.
1320   if (MI.memoperands_empty())
1321     return true;
1322 
1323   // See if any memory operand specifies an address space that involves VMEM.
1324   // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
1325   // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1326   // (GDS) address space is not supported by flat operations. Therefore, simply
1327   // return true unless only the LDS address space is found.
1328   for (const MachineMemOperand *Memop : MI.memoperands()) {
1329     unsigned AS = Memop->getAddrSpace();
1330     assert(AS != AMDGPUAS::REGION_ADDRESS);
1331     if (AS != AMDGPUAS::LOCAL_ADDRESS)
1332       return true;
1333   }
1334 
1335   return false;
1336 }
1337 
1338 // This is a flat memory operation. Check to see if it has memory tokens for
1339 // either LDS or FLAT.
1340 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1341   assert(TII->isFLAT(MI));
1342 
1343   // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
1344   if (!TII->usesLGKM_CNT(MI))
1345     return false;
1346 
1347   // If in tgsplit mode then there can be no use of LDS.
1348   if (ST->isTgSplitEnabled())
1349     return false;
1350 
1351   // If there are no memory operands then conservatively assume the flat
1352   // operation may access LDS.
1353   if (MI.memoperands_empty())
1354     return true;
1355 
1356   // See if any memory operand specifies an address space that involves LDS.
1357   for (const MachineMemOperand *Memop : MI.memoperands()) {
1358     unsigned AS = Memop->getAddrSpace();
1359     if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1360       return true;
1361   }
1362 
1363   return false;
1364 }
1365 
1366 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1367                                                WaitcntBrackets *ScoreBrackets) {
1368   // Now look at the instruction opcode. If it is a memory access
1369   // instruction, update the upper-bound of the appropriate counter's
1370   // bracket and the destination operand scores.
1371   // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1372   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1373     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1374         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1375       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1376       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1377     } else {
1378       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1379     }
1380   } else if (TII->isFLAT(Inst)) {
1381     assert(Inst.mayLoadOrStore());
1382 
1383     int FlatASCount = 0;
1384 
1385     if (mayAccessVMEMThroughFlat(Inst)) {
1386       ++FlatASCount;
1387       if (!ST->hasVscnt())
1388         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1389       else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
1390         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1391       else
1392         ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1393     }
1394 
1395     if (mayAccessLDSThroughFlat(Inst)) {
1396       ++FlatASCount;
1397       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1398     }
1399 
1400     // A Flat memory operation must access at least one address space.
1401     assert(FlatASCount);
1402 
1403     // This is a flat memory operation that access both VMEM and LDS, so note it
1404     // - it will require that both the VM and LGKM be flushed to zero if it is
1405     // pending when a VM or LGKM dependency occurs.
1406     if (FlatASCount > 1)
1407       ScoreBrackets->setPendingFlat();
1408   } else if (SIInstrInfo::isVMEM(Inst) &&
1409              !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
1410     if (!ST->hasVscnt())
1411       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1412     else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
1413              /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1414              (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1415       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1416     else if (Inst.mayStore())
1417       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1418 
1419     if (ST->vmemWriteNeedsExpWaitcnt() &&
1420         (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1421       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1422     }
1423   } else if (TII->isSMRD(Inst)) {
1424     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1425   } else if (Inst.isCall()) {
1426     if (callWaitsOnFunctionReturn(Inst)) {
1427       // Act as a wait on everything
1428       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1429     } else {
1430       // May need to way wait for anything.
1431       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1432     }
1433   } else if (SIInstrInfo::isLDSDIR(Inst)) {
1434     ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
1435   } else if (TII->isVINTERP(Inst)) {
1436     int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
1437     ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
1438   } else if (SIInstrInfo::isEXP(Inst)) {
1439     unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1440     if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1441       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1442     else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1443       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1444     else
1445       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1446   } else {
1447     switch (Inst.getOpcode()) {
1448     case AMDGPU::S_SENDMSG:
1449     case AMDGPU::S_SENDMSG_RTN_B32:
1450     case AMDGPU::S_SENDMSG_RTN_B64:
1451     case AMDGPU::S_SENDMSGHALT:
1452       ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1453       break;
1454     case AMDGPU::S_MEMTIME:
1455     case AMDGPU::S_MEMREALTIME:
1456       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1457       break;
1458     }
1459   }
1460 }
1461 
1462 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1463                                  unsigned OtherScore) {
1464   unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1465   unsigned OtherShifted =
1466       OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1467   Score = std::max(MyShifted, OtherShifted);
1468   return OtherShifted > MyShifted;
1469 }
1470 
1471 /// Merge the pending events and associater score brackets of \p Other into
1472 /// this brackets status.
1473 ///
1474 /// Returns whether the merge resulted in a change that requires tighter waits
1475 /// (i.e. the merged brackets strictly dominate the original brackets).
1476 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1477   bool StrictDom = false;
1478 
1479   VgprUB = std::max(VgprUB, Other.VgprUB);
1480   SgprUB = std::max(SgprUB, Other.SgprUB);
1481 
1482   for (auto T : inst_counter_types()) {
1483     // Merge event flags for this counter
1484     const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1485     const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1486     if (OtherEvents & ~OldEvents)
1487       StrictDom = true;
1488     PendingEvents |= OtherEvents;
1489 
1490     // Merge scores for this counter
1491     const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1492     const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1493     const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1494     if (NewUB < ScoreLBs[T])
1495       report_fatal_error("waitcnt score overflow");
1496 
1497     MergeInfo M;
1498     M.OldLB = ScoreLBs[T];
1499     M.OtherLB = Other.ScoreLBs[T];
1500     M.MyShift = NewUB - ScoreUBs[T];
1501     M.OtherShift = NewUB - Other.ScoreUBs[T];
1502 
1503     ScoreUBs[T] = NewUB;
1504 
1505     StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1506 
1507     bool RegStrictDom = false;
1508     for (int J = 0; J <= VgprUB; J++) {
1509       RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1510     }
1511 
1512     if (T == VM_CNT) {
1513       for (int J = 0; J <= VgprUB; J++) {
1514         unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1515         RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
1516         VgprVmemTypes[J] = NewVmemTypes;
1517       }
1518     }
1519 
1520     if (T == LGKM_CNT) {
1521       for (int J = 0; J <= SgprUB; J++) {
1522         RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1523       }
1524     }
1525 
1526     if (RegStrictDom)
1527       StrictDom = true;
1528   }
1529 
1530   return StrictDom;
1531 }
1532 
1533 // Generate s_waitcnt instructions where needed.
1534 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1535                                             MachineBasicBlock &Block,
1536                                             WaitcntBrackets &ScoreBrackets) {
1537   bool Modified = false;
1538 
1539   LLVM_DEBUG({
1540     dbgs() << "*** Block" << Block.getNumber() << " ***";
1541     ScoreBrackets.dump();
1542   });
1543 
1544   // Track the correctness of vccz through this basic block. There are two
1545   // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1546   // ST->partialVCCWritesUpdateVCCZ().
1547   bool VCCZCorrect = true;
1548   if (ST->hasReadVCCZBug()) {
1549     // vccz could be incorrect at a basic block boundary if a predecessor wrote
1550     // to vcc and then issued an smem load.
1551     VCCZCorrect = false;
1552   } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1553     // vccz could be incorrect at a basic block boundary if a predecessor wrote
1554     // to vcc_lo or vcc_hi.
1555     VCCZCorrect = false;
1556   }
1557 
1558   // Walk over the instructions.
1559   MachineInstr *OldWaitcntInstr = nullptr;
1560 
1561   for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1562                                          E = Block.instr_end();
1563        Iter != E;) {
1564     MachineInstr &Inst = *Iter;
1565 
1566     // Track pre-existing waitcnts that were added in earlier iterations or by
1567     // the memory legalizer.
1568     if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1569         (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1570          Inst.getOperand(0).isReg() &&
1571          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1572       if (!OldWaitcntInstr)
1573         OldWaitcntInstr = &Inst;
1574       ++Iter;
1575       continue;
1576     }
1577 
1578     bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
1579                       isPreheaderToFlush(Block, ScoreBrackets);
1580 
1581     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1582     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
1583                                           FlushVmCnt);
1584     OldWaitcntInstr = nullptr;
1585 
1586     // Restore vccz if it's not known to be correct already.
1587     bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1588 
1589     // Don't examine operands unless we need to track vccz correctness.
1590     if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1591       if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1592           Inst.definesRegister(AMDGPU::VCC_HI)) {
1593         // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1594         if (!ST->partialVCCWritesUpdateVCCZ())
1595           VCCZCorrect = false;
1596       } else if (Inst.definesRegister(AMDGPU::VCC)) {
1597         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
1598         // vccz bit, so when we detect that an instruction may read from a
1599         // corrupt vccz bit, we need to:
1600         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
1601         //    operations to complete.
1602         // 2. Restore the correct value of vccz by writing the current value
1603         //    of vcc back to vcc.
1604         if (ST->hasReadVCCZBug() &&
1605             ScoreBrackets.getScoreLB(LGKM_CNT) <
1606                 ScoreBrackets.getScoreUB(LGKM_CNT) &&
1607             ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1608           // Writes to vcc while there's an outstanding smem read may get
1609           // clobbered as soon as any read completes.
1610           VCCZCorrect = false;
1611         } else {
1612           // Writes to vcc will fix any incorrect value in vccz.
1613           VCCZCorrect = true;
1614         }
1615       }
1616     }
1617 
1618     if (TII->isSMRD(Inst)) {
1619       for (const MachineMemOperand *Memop : Inst.memoperands()) {
1620         // No need to handle invariant loads when avoiding WAR conflicts, as
1621         // there cannot be a vector store to the same memory location.
1622         if (!Memop->isInvariant()) {
1623           const Value *Ptr = Memop->getValue();
1624           SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
1625         }
1626       }
1627       if (ST->hasReadVCCZBug()) {
1628         // This smem read could complete and clobber vccz at any time.
1629         VCCZCorrect = false;
1630       }
1631     }
1632 
1633     updateEventWaitcntAfter(Inst, &ScoreBrackets);
1634 
1635 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1636     // If this instruction generates a S_SETVSKIP because it is an
1637     // indexed resource, and we are on Tahiti, then it will also force
1638     // an S_WAITCNT vmcnt(0)
1639     if (RequireCheckResourceType(Inst, context)) {
1640       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1641       ScoreBrackets->setScoreLB(VM_CNT,
1642       ScoreBrackets->getScoreUB(VM_CNT));
1643     }
1644 #endif
1645 
1646     LLVM_DEBUG({
1647       Inst.print(dbgs());
1648       ScoreBrackets.dump();
1649     });
1650 
1651     // TODO: Remove this work-around after fixing the scheduler and enable the
1652     // assert above.
1653     if (RestoreVCCZ) {
1654       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
1655       // bit is updated, so we can restore the bit by reading the value of
1656       // vcc and then writing it back to the register.
1657       BuildMI(Block, Inst, Inst.getDebugLoc(),
1658               TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1659               TRI->getVCC())
1660           .addReg(TRI->getVCC());
1661       VCCZCorrect = true;
1662       Modified = true;
1663     }
1664 
1665     ++Iter;
1666   }
1667 
1668   if (Block.getFirstTerminator() == Block.end() &&
1669       isPreheaderToFlush(Block, ScoreBrackets))
1670     Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
1671 
1672   return Modified;
1673 }
1674 
1675 // Return true if the given machine basic block is a preheader of a loop in
1676 // which we want to flush the vmcnt counter, and false otherwise.
1677 bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
1678                                           WaitcntBrackets &ScoreBrackets) {
1679   if (PreheadersToFlush.count(&MBB))
1680     return PreheadersToFlush[&MBB];
1681 
1682   auto UpdateCache = [&](bool val) {
1683     PreheadersToFlush[&MBB] = val;
1684     return val;
1685   };
1686 
1687   MachineBasicBlock *Succ = MBB.getSingleSuccessor();
1688   if (!Succ)
1689     return UpdateCache(false);
1690 
1691   MachineLoop *Loop = MLI->getLoopFor(Succ);
1692   if (!Loop)
1693     return UpdateCache(false);
1694 
1695   if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
1696     return UpdateCache(true);
1697 
1698   return UpdateCache(false);
1699 }
1700 
1701 // Return true if it is better to flush the vmcnt counter in the preheader of
1702 // the given loop. We currently decide to flush in two situations:
1703 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
1704 //    vgpr containing a value that is loaded outside of the loop. (Only on
1705 //    targets with no vscnt counter).
1706 // 2. The loop contains vmem load(s), but the loaded values are not used in the
1707 //    loop, and at least one use of a vgpr containing a value that is loaded
1708 //    outside of the loop.
1709 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
1710                                         WaitcntBrackets &Brackets) {
1711   bool HasVMemLoad = false;
1712   bool HasVMemStore = false;
1713   bool UsesVgprLoadedOutside = false;
1714   DenseSet<Register> VgprUse;
1715   DenseSet<Register> VgprDef;
1716 
1717   for (MachineBasicBlock *MBB : ML->blocks()) {
1718     for (MachineInstr &MI : *MBB) {
1719       if (SIInstrInfo::isVMEM(MI)) {
1720         if (MI.mayLoad())
1721           HasVMemLoad = true;
1722         if (MI.mayStore())
1723           HasVMemStore = true;
1724       }
1725       for (unsigned I = 0; I < MI.getNumOperands(); I++) {
1726         MachineOperand &Op = MI.getOperand(I);
1727         if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
1728           continue;
1729         RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
1730         // Vgpr use
1731         if (Op.isUse()) {
1732           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1733             // If we find a register that is loaded inside the loop, 1. and 2.
1734             // are invalidated and we can exit.
1735             if (VgprDef.contains(RegNo))
1736               return false;
1737             VgprUse.insert(RegNo);
1738             // If at least one of Op's registers is in the score brackets, the
1739             // value is likely loaded outside of the loop.
1740             if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
1741               UsesVgprLoadedOutside = true;
1742               break;
1743             }
1744           }
1745         }
1746         // VMem load vgpr def
1747         else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
1748           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1749             // If we find a register that is loaded inside the loop, 1. and 2.
1750             // are invalidated and we can exit.
1751             if (VgprUse.contains(RegNo))
1752               return false;
1753             VgprDef.insert(RegNo);
1754           }
1755       }
1756     }
1757   }
1758   if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
1759     return true;
1760   return HasVMemLoad && UsesVgprLoadedOutside;
1761 }
1762 
1763 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1764   ST = &MF.getSubtarget<GCNSubtarget>();
1765   TII = ST->getInstrInfo();
1766   TRI = &TII->getRegisterInfo();
1767   MRI = &MF.getRegInfo();
1768   IV = AMDGPU::getIsaVersion(ST->getCPU());
1769   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1770   MLI = &getAnalysis<MachineLoopInfo>();
1771   PDT = &getAnalysis<MachinePostDominatorTree>();
1772 
1773   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1774   for (auto T : inst_counter_types())
1775     ForceEmitWaitcnt[T] = false;
1776 
1777   HardwareLimits Limits = {};
1778   Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1779   Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1780   Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1781   Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
1782 
1783   unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1784   unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1785   assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1786   assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1787 
1788   RegisterEncoding Encoding = {};
1789   Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1790   Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
1791   Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1792   Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
1793 
1794   TrackedWaitcntSet.clear();
1795   BlockInfos.clear();
1796   bool Modified = false;
1797 
1798   if (!MFI->isEntryFunction()) {
1799     // Wait for any outstanding memory operations that the input registers may
1800     // depend on. We can't track them and it's better to do the wait after the
1801     // costly call sequence.
1802 
1803     // TODO: Could insert earlier and schedule more liberally with operations
1804     // that only use caller preserved registers.
1805     MachineBasicBlock &EntryBB = MF.front();
1806     MachineBasicBlock::iterator I = EntryBB.begin();
1807     for (MachineBasicBlock::iterator E = EntryBB.end();
1808          I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1809       ;
1810     BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1811     if (ST->hasVscnt())
1812       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1813           .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1814           .addImm(0);
1815 
1816     Modified = true;
1817   }
1818 
1819   // Keep iterating over the blocks in reverse post order, inserting and
1820   // updating s_waitcnt where needed, until a fix point is reached.
1821   for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1822     BlockInfos.insert({MBB, BlockInfo(MBB)});
1823 
1824   std::unique_ptr<WaitcntBrackets> Brackets;
1825   bool Repeat;
1826   do {
1827     Repeat = false;
1828 
1829     for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1830          ++BII) {
1831       BlockInfo &BI = BII->second;
1832       if (!BI.Dirty)
1833         continue;
1834 
1835       if (BI.Incoming) {
1836         if (!Brackets)
1837           Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1838         else
1839           *Brackets = *BI.Incoming;
1840       } else {
1841         if (!Brackets)
1842           Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1843         else
1844           *Brackets = WaitcntBrackets(ST, Limits, Encoding);
1845       }
1846 
1847       Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1848       BI.Dirty = false;
1849 
1850       if (Brackets->hasPending()) {
1851         BlockInfo *MoveBracketsToSucc = nullptr;
1852         for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1853           auto SuccBII = BlockInfos.find(Succ);
1854           BlockInfo &SuccBI = SuccBII->second;
1855           if (!SuccBI.Incoming) {
1856             SuccBI.Dirty = true;
1857             if (SuccBII <= BII)
1858               Repeat = true;
1859             if (!MoveBracketsToSucc) {
1860               MoveBracketsToSucc = &SuccBI;
1861             } else {
1862               SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1863             }
1864           } else if (SuccBI.Incoming->merge(*Brackets)) {
1865             SuccBI.Dirty = true;
1866             if (SuccBII <= BII)
1867               Repeat = true;
1868           }
1869         }
1870         if (MoveBracketsToSucc)
1871           MoveBracketsToSucc->Incoming = std::move(Brackets);
1872       }
1873     }
1874   } while (Repeat);
1875 
1876   if (ST->hasScalarStores()) {
1877     SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1878     bool HaveScalarStores = false;
1879 
1880     for (MachineBasicBlock &MBB : MF) {
1881       for (MachineInstr &MI : MBB) {
1882         if (!HaveScalarStores && TII->isScalarStore(MI))
1883           HaveScalarStores = true;
1884 
1885         if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1886             MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1887           EndPgmBlocks.push_back(&MBB);
1888       }
1889     }
1890 
1891     if (HaveScalarStores) {
1892       // If scalar writes are used, the cache must be flushed or else the next
1893       // wave to reuse the same scratch memory can be clobbered.
1894       //
1895       // Insert s_dcache_wb at wave termination points if there were any scalar
1896       // stores, and only if the cache hasn't already been flushed. This could
1897       // be improved by looking across blocks for flushes in postdominating
1898       // blocks from the stores but an explicitly requested flush is probably
1899       // very rare.
1900       for (MachineBasicBlock *MBB : EndPgmBlocks) {
1901         bool SeenDCacheWB = false;
1902 
1903         for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
1904              I != E; ++I) {
1905           if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1906             SeenDCacheWB = true;
1907           else if (TII->isScalarStore(*I))
1908             SeenDCacheWB = false;
1909 
1910           // FIXME: It would be better to insert this before a waitcnt if any.
1911           if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1912                I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1913               !SeenDCacheWB) {
1914             Modified = true;
1915             BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1916           }
1917         }
1918       }
1919     }
1920   }
1921 
1922   return Modified;
1923 }
1924