//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
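
// Illustrative sketch (the latency figure is an assumption of this note, not
// taken from the scheduling model): with two dependent MFMAs whose pipeline
// latency is 16 cycles, -amdgpu-mfma-padding-ratio=50 requests roughly
// 16 * 50 / 100 = 8 wait states of s_nop padding between them (see
// getMFMAPipelineWaitStates below for how the latency is queried).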

// This is intended for debugging purposes only.
static cl::opt<unsigned>
    NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
               cl::desc("Insert an s_nop x before every instruction"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
}

static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}
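
// Illustrative example (the encoding details are an assumption of this note):
// for "s_setreg_b32 hwreg(HW_REG_TRAPSTS, 0, 32), s0" the simm16 operand
// packs (id, offset, width); HwregEncoding::decode unpacks it and this helper
// returns only the id field, so two accesses to the same hardware register
// compare equal regardless of offset/width.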

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}
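
// Illustrative worked example: s_nop N stalls for N + 1 wait states and N is
// at most 7, so a request for Quantity = 10 is emitted as
//   s_nop 7   ; 8 wait states
//   s_nop 1   ; 2 wait states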

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI =
      std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E =
      CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return std::max(W, NopPadding.getValue());
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI))
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (ST.hasGFX950Insts() && isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first.  Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
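
// Illustrative example: if CurrCycleInstr has NumWaitStates == 3 and
// MaxLookAhead >= 3, AdvanceCycle() pushes the instruction followed by two
// nullptr entries, so EmittedInstrs models one issued instruction plus two
// empty wait-state slots.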

void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}
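
// A minimal usage sketch (the state type and predicates are hypothetical, and
// TRI is assumed to be the recognizer's SIRegisterInfo): search back through
// at most four VALU instructions for one that writes v0.
//
//   struct ExampleState { int VALUs = 0; };
//   auto IsHazard = [&](ExampleState &S, const MachineInstr &I) {
//     if (S.VALUs >= 4)
//       return HazardExpired;
//     return SIInstrInfo::isVALU(I) &&
//                    I.modifiesRegister(AMDGPU::VGPR0, &TRI)
//                ? HazardFound
//                : NoHazardFound;
//   };
//   auto Update = [](ExampleState &S, const MachineInstr &I) {
//     if (SIInstrInfo::isVALU(I))
//       ++S.VALUs;
//   };
//   DenseSet<const MachineBasicBlock *> Visited;
//   bool Found = hasHazard<ExampleState>(
//       ExampleState(), IsHazard, Update, MI->getParent(),
//       std::next(MI->getReverseIterator()), Visited);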

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and they only matter if XNACK
  // is enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions.  The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
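  //
  // For example (illustrative, registers hypothetical):
  //   s_load_dwordx2 s[0:1], s[4:5], 0x0
  //   s_load_dword   s2, s[0:1], 0x0    ; uses s[0:1] defined in-clause
  // Here the second load must be separated from the first by a non-SMEM
  // instruction.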

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading the
    // descriptor need some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably because
    // the only case where this happens is when we expand a 64-bit pointer into
    // a full descriptor and use s_buffer_load_dword instead of s_load_dword,
    // which was probably never encountered in closed-source land.
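    // Illustrative pattern (hypothetical registers):
    //   s_mov_b32           s4, s0
    //   s_buffer_load_dword s0, s[4:7], 0x0  ; resource includes s4 from above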
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                   IsBufferHazardDefFn,
                                                   SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
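  // Illustrative pattern (hypothetical operands):
  //   v_cmp_eq_u32   vcc, v0, v1
  //   s_nop 3                         ; 4 wait states
  //   v_div_fmas_f32 v2, v3, v4, v5   ; reads vcc implicitly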
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T#, the
  // store size is greater than 8 bytes, and they have more than two bits of
  // their dmask set. All our MIMG definitions use a 256-bit T#, so we can
  // skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs.
    if (VDataIdx == -1)
      return -1;

    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// A dest-sel forwarding issue occurs if additional logic is needed to
/// swizzle / pack the computed value into the correct bit position of the
/// dest register. This occurs if we have SDWA with dst_sel != DWORD or if we
/// have op_sel with dst_sel that is not aligned to the register. This
/// function analyzes \p MI and \returns an operand with a dst forwarding
/// issue, or nullptr if none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce a
  // forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which write the
  // hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with
  // dest byte sel, e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which write the hi bits
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // Special case: a nop is required for all the op_sel values of the fp4 sr
  // variant cvt scale instructions.
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}

/// Checks whether the provided \p VALU "consumes" the operand with a dest-sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}
946 
947 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
948   int WaitStatesNeeded = 0;
949 
950   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
951     const int TransDefWaitstates = 1;
952 
953     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
954       if (!SIInstrInfo::isTRANS(MI))
955         return false;
956       const SIRegisterInfo *TRI = ST.getRegisterInfo();
957       const SIInstrInfo *TII = ST.getInstrInfo();
958       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
959 
960       for (const MachineOperand &Use : VALU->explicit_uses()) {
961         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
962           return true;
963       }
964 
965       return false;
966     };
967 
968     int WaitStatesNeededForDef =
969         TransDefWaitstates -
970         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
971     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
972   }
973 
974   if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
975     const int Shift16DefWaitstates = 1;
976 
977     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
978       const SIRegisterInfo *TRI = ST.getRegisterInfo();
979       const MachineOperand *ForwardedDst =
980           getDstSelForwardingOperand(ProducerMI, ST);
981       if (ForwardedDst) {
982         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
983       }
984 
985       if (ProducerMI.isInlineAsm()) {
986         // Assume inline asm has dst forwarding hazard
987         for (auto &Def : ProducerMI.all_defs()) {
988           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
989             return true;
990         }
991       }
992 
993       return false;
994     };
995 
996     int WaitStatesNeededForDef =
997         Shift16DefWaitstates -
998         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
999     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1000   }
1001 
1002   if (ST.hasVDecCoExecHazard()) {
1003     const int VALUWriteSGPRVALUReadWaitstates = 2;
1004     const int VALUWriteEXECRWLane = 4;
1005     const int VALUWriteVGPRReadlaneRead = 1;
1006 
1007     const SIRegisterInfo *TRI = ST.getRegisterInfo();
1008     const MachineRegisterInfo &MRI = MF.getRegInfo();
1009     Register UseReg;
1010     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1011       if (!SIInstrInfo::isVALU(MI))
1012         return false;
1013       return MI.modifiesRegister(UseReg, TRI);
1014     };
1015 
1016     for (const MachineOperand &Use : VALU->explicit_uses()) {
1017       if (!Use.isReg())
1018         continue;
1019 
1020       UseReg = Use.getReg();
1021       if (TRI->isSGPRReg(MRI, UseReg)) {
1022         int WaitStatesNeededForDef =
1023             VALUWriteSGPRVALUReadWaitstates -
1024             getWaitStatesSince(IsVALUDefSGPRFn,
1025                                VALUWriteSGPRVALUReadWaitstates);
1026         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1027       }
1028     }
1029 
1030     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1031       UseReg = AMDGPU::VCC;
1032       int WaitStatesNeededForDef =
1033           VALUWriteSGPRVALUReadWaitstates -
1034           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1035       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1036     }
1037 
1038     switch (VALU->getOpcode()) {
1039     case AMDGPU::V_READLANE_B32:
1040     case AMDGPU::V_READFIRSTLANE_B32: {
1041       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1042       UseReg = Src->getReg();
1043       int WaitStatesNeededForDef =
1044           VALUWriteVGPRReadlaneRead -
1045           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1046       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1047     }
1048       [[fallthrough]];
1049     case AMDGPU::V_WRITELANE_B32: {
1050       UseReg = AMDGPU::EXEC;
1051       int WaitStatesNeededForDef =
1052           VALUWriteEXECRWLane -
1053           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1054       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1055       break;
1056     }
1057     default:
1058       break;
1059     }
1060   }
1061 
1062   // This checks for the hazard where VMEM instructions that store more than
1063   // 8 bytes can have there store data over written by the next instruction.
1064   if (!ST.has12DWordStoreHazard())
1065     return WaitStatesNeeded;
1066 
1067   const MachineRegisterInfo &MRI = MF.getRegInfo();
1068 
1069   for (const MachineOperand &Def : VALU->defs()) {
1070     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1071   }
1072 
1073   return WaitStatesNeeded;
1074 }
1075 
1076 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1077   // This checks for hazards associated with inline asm statements.
1078   // Since inline asms can contain just about anything, we use this
1079   // to call/leverage other check*Hazard routines. Note that
1080   // this function doesn't attempt to address all possible inline asm
1081   // hazards (good luck), but is a collection of what has been
1082   // problematic thus far.
1083 
1084   // see checkVALUHazards()
1085   if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1086       !ST.hasCvtScaleForwardingHazard())
1087     return 0;
1088 
1089   const MachineRegisterInfo &MRI = MF.getRegInfo();
1090   int WaitStatesNeeded = 0;
1091 
1092   for (const MachineOperand &Op :
1093        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1094     if (Op.isReg() && Op.isDef()) {
1095       if (!TRI.isVectorRegister(MRI, Op.getReg()))
1096         continue;
1097 
1098       if (ST.has12DWordStoreHazard()) {
1099         WaitStatesNeeded =
1100             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1101       }
1102     }
1103   }
1104 
1105   if (ST.hasDstSelForwardingHazard()) {
1106     const int Shift16DefWaitstates = 1;
1107 
1108     auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1109       const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1110       // Assume inline asm reads the dst
1111       if (Dst)
1112         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1113                IA->readsRegister(Dst->getReg(), &TRI);
1114 
1115       if (ProducerMI.isInlineAsm()) {
1116         // If MI is inline asm, assume it has dst forwarding hazard
1117         for (auto &Def : ProducerMI.all_defs()) {
1118           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1119               IA->readsRegister(Def.getReg(), &TRI)) {
1120             return true;
1121           }
1122         }
1123       }
1124 
1125       return false;
1126     };
1127 
1128     int WaitStatesNeededForDef =
1129         Shift16DefWaitstates -
1130         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1131     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1132   }
1133 
1134   return WaitStatesNeeded;
1135 }
1136 
1137 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1138   const SIInstrInfo *TII = ST.getInstrInfo();
1139   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1140   const MachineRegisterInfo &MRI = MF.getRegInfo();
1141 
1142   const MachineOperand *LaneSelectOp =
1143       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1144 
1145   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1146     return 0;
1147 
1148   Register LaneSelectReg = LaneSelectOp->getReg();
1149   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1150 
1151   const int RWLaneWaitStates = 4;
1152   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1153                                               RWLaneWaitStates);
1154   return RWLaneWaitStates - WaitStatesSince;
1155 }
1156 
1157 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1158   if (!ST.hasRFEHazards())
1159     return 0;
1160 
1161   const SIInstrInfo *TII = ST.getInstrInfo();
1162 
1163   const int RFEWaitStates = 1;
1164 
1165   auto IsHazardFn = [TII](const MachineInstr &MI) {
1166     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1167   };
1168   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1169   return RFEWaitStates - WaitStatesNeeded;
1170 }
1171 
1172 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1173   const SIInstrInfo *TII = ST.getInstrInfo();
1174   const int ReadM0WaitStates = 1;
1175   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1176   return ReadM0WaitStates -
1177          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1178 }
1179 
1180 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1181   fixVMEMtoScalarWriteHazards(MI);
1182   fixVcmpxPermlaneHazards(MI);
1183   fixSMEMtoVectorWriteHazards(MI);
1184   fixVcmpxExecWARHazard(MI);
1185   fixLdsBranchVmemWARHazard(MI);
1186   if (ST.hasLdsDirect()) {
1187     fixLdsDirectVALUHazard(MI);
1188     fixLdsDirectVMEMHazard(MI);
1189   }
1190   fixVALUPartialForwardingHazard(MI);
1191   fixVALUTransUseHazard(MI);
1192   fixWMMAHazards(MI);
1193   fixShift64HighRegBug(MI);
1194   fixVALUMaskWriteHazard(MI);
1195   fixRequiredExportPriority(MI);
1196 }
1197 
1198 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1199                               const MachineInstr &MI) {
1200   return (TII.isVOPC(MI) ||
1201           (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1202          MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1203 }
1204 
1205 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1206   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1207     return false;
1208 
1209   const SIInstrInfo *TII = ST.getInstrInfo();
1210   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1211   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1212     return isVCmpXWritesExec(*TII, *TRI, MI);
1213   };
1214 
1215   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1216     unsigned Opc = MI.getOpcode();
1217     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1218            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1219   };
1220 
1221   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1222       std::numeric_limits<int>::max())
1223     return false;
1224 
1225   // V_NOP will be discarded by SQ.
1226   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1227   // which is always a VGPR and available.
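  // Illustrative result (hypothetical registers): before
  //   v_permlane16_b32 v0, v1, s0, s1
  // this inserts
  //   v_mov_b32_e32 v1, v1
  // reusing src0 (v1) so no extra register is needed.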
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing the lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at-risk SMEM (breaking the chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (const MachineOperand &MO : MI.implicit_operands())
        if (MO.isDef() &&
            TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
                 SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
        SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };
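
  // Illustrative shape of the hazard (hypothetical blocks): an LDS access
  // followed, across a branch, by a VMEM access (or vice versa) requires an
  //   s_waitcnt_vscnt null, 0x0
  // on the path between the two before the second access may issue.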
1457 
1458   auto InstType = IsHazardInst(*MI);
1459   if (!InstType)
1460     return false;
1461 
1462   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1463     return IsHazardInst(I) || isStoreCountWaitZero(I);
1464   };
1465 
1466   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1467     if (!I.isBranch())
1468       return false;
1469 
1470     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1471       auto InstType2 = IsHazardInst(I);
1472       return InstType2 && InstType != InstType2;
1473     };
1474 
1475     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1476       auto InstType2 = IsHazardInst(I);
1477       if (InstType == InstType2)
1478         return true;
1479 
1480       return isStoreCountWaitZero(I);
1481     };
1482 
1483     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1484            std::numeric_limits<int>::max();
1485   };
1486 
1487   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1488       std::numeric_limits<int>::max())
1489     return false;
1490 
1491   const SIInstrInfo *TII = ST.getInstrInfo();
1492   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1493           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1494     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1495     .addImm(0);
1496 
1497   return true;
1498 }
1499 
1500 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1501   if (!SIInstrInfo::isLDSDIR(*MI))
1502     return false;
1503 
1504   const int NoHazardWaitStates = 15;
1505   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1506   const Register VDSTReg = VDST->getReg();
1507 
1508   bool VisitedTrans = false;
1509   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1510     if (!SIInstrInfo::isVALU(I))
1511       return false;
1512     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1513     // Cover both WAR and WAW
1514     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1515   };
1516   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1517     if (WaitStates >= NoHazardWaitStates)
1518       return true;
1519     // Instructions which cause va_vdst==0 expire hazard
1520     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1521            SIInstrInfo::isEXP(I);
1522   };
1523   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1524     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1525   };
1526 
1527   DenseSet<const MachineBasicBlock *> Visited;
1528   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1529                                     std::next(MI->getReverseIterator()), 0,
1530                                     IsExpiredFn, Visited, GetWaitStatesFn);
1531 
1532   // Transcendentals can execute in parallel to other VALUs.
1533   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1534   if (VisitedTrans)
1535     Count = 0;
1536 
1537   MachineOperand *WaitVdstOp =
1538       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1539   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1540 
1541   return true;
1542 }
1543 
1544 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1545   if (!SIInstrInfo::isLDSDIR(*MI))
1546     return false;
1547 
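       // Hazard: a VMEM or DS access to vdst may still be outstanding when the
       // LDSDIR issues. Mitigate with waitvsrc(0) where the encoding supports it,
       // otherwise with an explicit "s_waitcnt_depctr vm_vsrc(0)".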
1548   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1549   const Register VDSTReg = VDST->getReg();
1550 
1551   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1552     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1553       return false;
1554     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1555   };
1556   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1557   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1558   // according to the type of VMEM instruction.
1559   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1560     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1561            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1562            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1563             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1564            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1565             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1566   };
1567 
1568   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1569       std::numeric_limits<int>::max())
1570     return false;
1571 
1572   if (LdsdirCanWait) {
1573     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1574   } else {
1575     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1576             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1577         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1578   }
1579 
1580   return true;
1581 }
1582 
1583 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1584   if (!ST.hasVALUPartialForwardingHazard())
1585     return false;
1586   assert(!ST.hasExtendedWaitCounts());
1587 
1588   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1589     return false;
1590 
1591   SmallSetVector<Register, 4> SrcVGPRs;
1592 
1593   for (const MachineOperand &Use : MI->explicit_uses()) {
1594     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1595       SrcVGPRs.insert(Use.getReg());
1596   }
1597 
1598   // Only applies with >= 2 unique VGPR sources
1599   if (SrcVGPRs.size() <= 1)
1600     return false;
1601 
1602   // Look for the following pattern:
1603   //   Va <- VALU [PreExecPos]
1604   //   intv1
1605   //   Exec <- SALU [ExecPos]
1606   //   intv2
1607   //   Vb <- VALU [PostExecPos]
1608   //   intv3
1609   //   MI Va, Vb (WaitState = 0)
1610   //
1611   // Where:
1612   // intv1 + intv2 <= 2 VALUs
1613   // intv3 <= 4 VALUs
1614   //
1615   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1616 
1617   const int Intv1plus2MaxVALUs = 2;
1618   const int Intv3MaxVALUs = 4;
1619   const int IntvMaxVALUs = 6;
1620   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1621 
1622   struct StateType {
1623     SmallDenseMap<Register, int, 4> DefPos;
1624     int ExecPos = std::numeric_limits<int>::max();
1625     int VALUs = 0;
1626   };
1627 
1628   StateType State;
1629 
1630   // This overloads expiry testing with all the hazard detection
1631   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1632     // Too many VALU states have passed
1633     if (State.VALUs > NoHazardVALUWaitStates)
1634       return HazardExpired;
1635 
1636     // Instructions which cause va_vdst==0 expire the hazard
1637     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1638         SIInstrInfo::isEXP(I) ||
1639         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1640          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1641       return HazardExpired;
1642 
1643     // Track register writes
1644     bool Changed = false;
1645     if (SIInstrInfo::isVALU(I)) {
1646       for (Register Src : SrcVGPRs) {
1647         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1648           State.DefPos[Src] = State.VALUs;
1649           Changed = true;
1650         }
1651       }
1652     } else if (SIInstrInfo::isSALU(I)) {
1653       if (State.ExecPos == std::numeric_limits<int>::max()) {
1654         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1655           State.ExecPos = State.VALUs;
1656           Changed = true;
1657         }
1658       }
1659     }
1660 
1661     // Early expiration: too many VALUs in intv3
1662     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1663       return HazardExpired;
1664 
1665     // Only evaluate state if something changed
1666     if (!Changed)
1667       return NoHazardFound;
1668 
1669     // Determine positions of VALUs pre/post exec change
1670     if (State.ExecPos == std::numeric_limits<int>::max())
1671       return NoHazardFound;
1672 
1673     int PreExecPos = std::numeric_limits<int>::max();
1674     int PostExecPos = std::numeric_limits<int>::max();
1675 
1676     for (auto Entry : State.DefPos) {
1677       int DefVALUs = Entry.second;
1678       if (DefVALUs != std::numeric_limits<int>::max()) {
1679         if (DefVALUs >= State.ExecPos)
1680           PreExecPos = std::min(PreExecPos, DefVALUs);
1681         else
1682           PostExecPos = std::min(PostExecPos, DefVALUs);
1683       }
1684     }
1685 
1686     // Need a VALU def after the exec change
1687     if (PostExecPos == std::numeric_limits<int>::max())
1688       return NoHazardFound;
1689 
1690     // Too many VALUs in intv3?
1691     int Intv3VALUs = PostExecPos;
1692     if (Intv3VALUs > Intv3MaxVALUs)
1693       return HazardExpired;
1694 
1695     // Too many VALUs in intv2?
1696     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1697     if (Intv2VALUs > Intv1plus2MaxVALUs)
1698       return HazardExpired;
1699 
1700     // Need a VALU def before the exec change
1701     if (PreExecPos == std::numeric_limits<int>::max())
1702       return NoHazardFound;
1703 
1704     // Too many VALUs in intv1?
1705     int Intv1VALUs = PreExecPos - State.ExecPos;
1706     if (Intv1VALUs > Intv1plus2MaxVALUs)
1707       return HazardExpired;
1708 
1709     // Too many VALUs in intv1 + intv2?
1710     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1711       return HazardExpired;
1712 
1713     return HazardFound;
1714   };
1715   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1716     if (SIInstrInfo::isVALU(MI))
1717       State.VALUs += 1;
1718   };
1719 
1720   DenseSet<const MachineBasicBlock *> Visited;
1721   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1722                             std::next(MI->getReverseIterator()), Visited))
1723     return false;
1724 
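       // Mitigate by forcing va_vdst==0. Note: 0x0fff zeroes the va_vdst field
       // and leaves the remaining DepCtr fields at their no-wait values.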
1725   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1726           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1727       .addImm(0x0fff);
1728 
1729   return true;
1730 }
1731 
1732 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1733   if (!ST.hasVALUTransUseHazard())
1734     return false;
1735   assert(!ST.hasExtendedWaitCounts());
1736 
1737   if (!SIInstrInfo::isVALU(*MI))
1738     return false;
1739 
1740   SmallSet<Register, 4> SrcVGPRs;
1741 
1742   for (const MachineOperand &Use : MI->explicit_uses()) {
1743     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1744       SrcVGPRs.insert(Use.getReg());
1745   }
1746 
1747   // Look for the following pattern:
1748   //   Va <- TRANS VALU
1749   //   intv
1750   //   MI Va (WaitState = 0)
1751   //
1752   // Where:
1753   // intv <= 5 VALUs / 1 TRANS
1754   //
1755   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1756 
1757   const int IntvMaxVALUs = 5;
1758   const int IntvMaxTRANS = 1;
1759 
1760   struct StateType {
1761     int VALUs = 0;
1762     int TRANS = 0;
1763   };
1764 
1765   StateType State;
1766 
1767   // This lambda folds all of the hazard detection into the expiry testing
1768   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1769     // Too many VALU states have passed
1770     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1771       return HazardExpired;
1772 
1773     // Instructions which cause va_vdst==0 expire the hazard
1774     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1775         SIInstrInfo::isEXP(I) ||
1776         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1777          I.getOperand(0).getImm() == 0x0fff))
1778       return HazardExpired;
1779 
1780     // Track register writes
1781     if (SIInstrInfo::isTRANS(I)) {
1782       for (Register Src : SrcVGPRs) {
1783         if (I.modifiesRegister(Src, &TRI)) {
1784           return HazardFound;
1785         }
1786       }
1787     }
1788 
1789     return NoHazardFound;
1790   };
1791   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1792     if (SIInstrInfo::isVALU(MI))
1793       State.VALUs += 1;
1794     if (SIInstrInfo::isTRANS(MI))
1795       State.TRANS += 1;
1796   };
1797 
1798   DenseSet<const MachineBasicBlock *> Visited;
1799   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1800                             std::next(MI->getReverseIterator()), Visited))
1801     return false;
1802 
1803   // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1804   // hazard is avoided.
1805   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1806           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1807       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1808 
1809   return true;
1810 }
1811 
1812 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1813   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1814     return false;
1815 
1816   const SIInstrInfo *TII = ST.getInstrInfo();
1817   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1818 
1819   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1820     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1821       return false;
1822 
1823     // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction
1824     // overlaps with the dest (matrix D) of the previous wmma.
1825     const Register CurSrc0Reg =
1826         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1827     const Register CurSrc1Reg =
1828         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1829 
1830     const Register PrevDstReg =
1831         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1832 
1833     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1834         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1835       return true;
1836     }
1837 
1838     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1839     // but Index can't overlap with PrevDstReg.
1840     if (AMDGPU::isGFX12Plus(ST)) {
1841       if (SIInstrInfo::isSWMMAC(*MI)) {
1842         const Register CurIndex =
1843             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1844         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1845           return true;
1846       }
1847       return false;
1848     }
1849 
1850     return false;
1851   };
1852 
1853   auto IsExpiredFn = [](const MachineInstr &I, int) {
1854     return SIInstrInfo::isVALU(I);
1855   };
1856 
1857   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1858       std::numeric_limits<int>::max())
1859     return false;
1860 
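       // A single V_NOP suffices to separate the dependent WMMA/SWMMAC pair.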
1861   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1862 
1863   return true;
1864 }
1865 
1866 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1867   if (!ST.hasShift64HighRegBug())
1868     return false;
1869   assert(!ST.hasExtendedWaitCounts());
1870 
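       // On affected subtargets, 64-bit shifts can misbehave when the shift amount
       // is read from the last VGPR of an 8-register allocation block. Work around
       // this by swapping the amount into a safe VGPR, rewriting the shift to use
       // it, and swapping back afterwards.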
1871   switch (MI->getOpcode()) {
1872   default:
1873     return false;
1874   case AMDGPU::V_LSHLREV_B64_e64:
1875   case AMDGPU::V_LSHRREV_B64_e64:
1876   case AMDGPU::V_ASHRREV_I64_e64:
1877     break;
1878   }
1879 
1880   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1881   if (!Amt->isReg())
1882     return false;
1883 
1884   Register AmtReg = Amt->getReg();
1885   const MachineRegisterInfo &MRI = MF.getRegInfo();
1886   // Check if this is the last VGPR in an 8-register allocation block.
1887   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1888     return false;
1889 
1890   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1891     return false;
1892 
1893   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1894   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1895   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1896   bool Overlapped = OverlappedSrc || OverlappedDst;
1897 
1898   assert(!OverlappedDst || !OverlappedSrc ||
1899          Src1->getReg() == MI->getOperand(0).getReg());
1900   assert(ST.needsAlignedVGPRs());
1901   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1902 
1903   Register NewReg;
1904   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1905                                    : AMDGPU::VGPR_32RegClass) {
1906     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1907       NewReg = Reg;
1908       break;
1909     }
1910   }
1911 
1912   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1913                                : NewReg;
1914   Register NewAmtLo;
1915 
1916   if (Overlapped)
1917     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1918 
1919   DebugLoc DL = MI->getDebugLoc();
1920   MachineBasicBlock *MBB = MI->getParent();
1921   // Insert a full waitcnt because the found register might have a pending wait.
1922   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1923       .addImm(0);
1924 
1925   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1926   if (Overlapped)
1927     runOnInstruction(
1928         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1929             .addDef(AmtReg - 1)
1930             .addReg(AmtReg - 1, RegState::Undef)
1931             .addReg(NewAmtLo, RegState::Undef));
1932   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1933                        .addDef(AmtReg)
1934                        .addReg(AmtReg, RegState::Undef)
1935                        .addReg(NewAmt, RegState::Undef));
1936 
1937   // Instructions emitted after the current instruction will be processed by the
1938   // parent loop of the hazard recognizer in a natural way.
1939   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1940           AmtReg)
1941       .addDef(NewAmt)
1942       .addReg(NewAmt)
1943       .addReg(AmtReg);
1944   if (Overlapped)
1945     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1946             AmtReg - 1)
1947         .addDef(NewAmtLo)
1948         .addReg(NewAmtLo)
1949         .addReg(AmtReg - 1);
1950 
1951   // Re-running the hazard recognizer on the modified instruction is not
1952   // necessary: the inserted V_SWAP_B32s have already both read and written the
1953   // new registers, so hazards related to those registers have been handled.
1954   Amt->setReg(NewAmt);
1955   Amt->setIsKill(false);
1956   // We do not update liveness, so the verifier may see it as undef.
1957   Amt->setIsUndef();
1958   if (OverlappedDst)
1959     MI->getOperand(0).setReg(NewReg);
1960   if (OverlappedSrc) {
1961     Src1->setReg(NewReg);
1962     Src1->setIsKill(false);
1963     Src1->setIsUndef();
1964   }
1965 
1966   return true;
1967 }
1968 
1969 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1970   int NSAtoVMEMWaitStates = 1;
1971 
1972   if (!ST.hasNSAtoVMEMBug())
1973     return 0;
1974 
1975   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1976     return 0;
1977 
1978   const SIInstrInfo *TII = ST.getInstrInfo();
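       // Only MUBUF/MTBUF instructions with bit 1 or bit 2 of the immediate offset
       // set can trigger the hazard.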
1979   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1980   if (!Offset || (Offset->getImm() & 6) == 0)
1981     return 0;
1982 
1983   auto IsHazardFn = [TII](const MachineInstr &I) {
1984     if (!SIInstrInfo::isMIMG(I))
1985       return false;
1986     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1987     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1988            TII->getInstSizeInBytes(I) >= 16;
1989   };
1990 
1991   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1992 }
1993 
1994 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1995   int FPAtomicToDenormModeWaitStates = 3;
1996 
1997   if (!ST.hasFPAtomicToDenormModeHazard())
1998     return 0;
1999   assert(!ST.hasExtendedWaitCounts());
2000 
2001   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2002     return 0;
2003 
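       // Hazard: an FP atomic VMEM access is still in flight when s_denorm_mode
       // executes. Any VALU or s_waitcnt variant in between expires it.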
2004   auto IsHazardFn = [](const MachineInstr &I) {
2005     if (!SIInstrInfo::isVMEM(I))
2006       return false;
2007     return SIInstrInfo::isFPAtomic(I);
2008   };
2009 
2010   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2011     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2012       return true;
2013 
2014     switch (MI.getOpcode()) {
2015     case AMDGPU::S_WAITCNT:
2016     case AMDGPU::S_WAITCNT_VSCNT:
2017     case AMDGPU::S_WAITCNT_VMCNT:
2018     case AMDGPU::S_WAITCNT_EXPCNT:
2019     case AMDGPU::S_WAITCNT_LGKMCNT:
2020     case AMDGPU::S_WAIT_IDLE:
2021       return true;
2022     default:
2023       break;
2024     }
2025 
2026     return false;
2027   };
2028 
2029   return FPAtomicToDenormModeWaitStates -
2030          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2031 }
2032 
2033 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2034   assert(SIInstrInfo::isMAI(*MI));
2035 
2036   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2037 }
2038 
2039 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2040   // Early exit if no padding is requested.
2041   if (MFMAPaddingRatio == 0)
2042     return 0;
2043 
2044   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2045   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2046     return 0;
2047 
2048   int NeighborMFMALatency = 0;
2049   auto IsNeighboringMFMA = [&NeighborMFMALatency,
2050                             this](const MachineInstr &MI) {
2051     if (!SIInstrInfo::isMFMA(MI))
2052       return false;
2053 
2054     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2055     return true;
2056   };
2057 
2058   const int MaxMFMAPipelineWaitStates = 16;
2059   int WaitStatesSinceNeighborMFMA =
2060       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2061 
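       // Request enough padding to fill MFMAPaddingRatio percent of the neighboring
       // MFMA's latency, less the wait states that have already elapsed.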
2062   int NeighborMFMAPaddingNeeded =
2063       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2064       WaitStatesSinceNeighborMFMA;
2065 
2066   return std::max(0, NeighborMFMAPaddingNeeded);
2067 }
2068 
2069 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2070   int WaitStatesNeeded = 0;
2071   unsigned Opc = MI->getOpcode();
2072 
2073   auto IsVALUFn = [](const MachineInstr &MI) {
2074     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2075   };
2076 
2077   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2078     const int LegacyVALUWritesVGPRWaitStates = 2;
2079     const int VALUWritesExecWaitStates = 4;
2080     const int MaxWaitStates = 4;
2081 
2082     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2083       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2084     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2085 
2086     if (WaitStatesNeeded < MaxWaitStates) {
2087       for (const MachineOperand &Use : MI->explicit_uses()) {
2088         const int MaxWaitStates = 2;
2089 
2090         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2091           continue;
2092 
2093         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2094           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2095         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2096 
2097         if (WaitStatesNeeded == MaxWaitStates)
2098           break;
2099       }
2100     }
2101   }
2102 
2103   for (const MachineOperand &Op : MI->explicit_operands()) {
2104     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2105       continue;
2106 
2107     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2108       continue;
2109 
2110     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2111     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2112     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2113     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2114     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2115     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2116     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2117     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2118     const int MaxWaitStates = 18;
2119     Register Reg = Op.getReg();
2120     unsigned HazardDefLatency = 0;
2121 
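         // Find a prior MFMA whose destination overlaps this operand, recording the
         // worst-case producer latency seen along the way.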
2122     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2123                                this](const MachineInstr &MI) {
2124       if (!SIInstrInfo::isMFMA(MI))
2125         return false;
2126       Register DstReg = MI.getOperand(0).getReg();
2127       if (DstReg == Reg)
2128         return false;
2129       HazardDefLatency =
2130           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2131       return TRI.regsOverlap(DstReg, Reg);
2132     };
2133 
2134     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2135                                                    MaxWaitStates);
2136     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2137     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2138     int OpNo = Op.getOperandNo();
2139     if (OpNo == SrcCIdx) {
2140       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2141     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2142       switch (HazardDefLatency) {
2143       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2144                break;
2145       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2146                break;
2147       case 16: [[fallthrough]];
2148       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2149                break;
2150       }
2151     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2152       switch (HazardDefLatency) {
2153       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2154                break;
2155       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2156                break;
2157       case 16: [[fallthrough]];
2158       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2159                break;
2160       }
2161     }
2162 
2163     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2164     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2165 
2166     if (WaitStatesNeeded == MaxWaitStates)
2167       return WaitStatesNeeded; // Early exit.
2168 
2169     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2170       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2171         return false;
2172       Register DstReg = MI.getOperand(0).getReg();
2173       return TRI.regsOverlap(Reg, DstReg);
2174     };
2175 
2176     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2177     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2178     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2179     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2180     if (OpNo == SrcCIdx)
2181       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2182     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2183       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2184 
2185     WaitStatesNeededForUse = NeedWaitStates -
2186       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2187     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2188 
2189     if (WaitStatesNeeded == MaxWaitStates)
2190       return WaitStatesNeeded; // Early exit.
2191   }
2192 
2193   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2194     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2195     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2196     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2197     const int MaxWaitStates = 13;
2198     Register DstReg = MI->getOperand(0).getReg();
2199     unsigned HazardDefLatency = 0;
2200 
2201     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2202                          this](const MachineInstr &MI) {
2203       if (!SIInstrInfo::isMFMA(MI))
2204         return false;
2205       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2206       HazardDefLatency =
2207           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2208       return TRI.regsOverlap(Reg, DstReg);
2209     };
2210 
2211     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2212     int NeedWaitStates;
2213     switch (HazardDefLatency) {
2214     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2215              break;
2216     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2217              break;
2218     case 16: [[fallthrough]];
2219     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2220              break;
2221     }
2222 
2223     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2224     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2225   }
2226 
2227   // Pad neighboring MFMA with noops for better inter-wave performance.
2228   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2229 
2230   return WaitStatesNeeded;
2231 }
2232 
2233 static int
2234 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2235                                                               bool IsGFX950) {
2236   // xdl def cycles | gfx940 | gfx950
2237   // 2 pass         |    3   |    4
2238   // 4 pass         |    5   |    6
2239   // 8 pass         |    9   |   10
2240   // 16 pass        |   17   |   18
2241   return NumPasses + 1 + IsGFX950;
2242 }
2243 
2244 static int
2245 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2246                                                               bool IsGFX950) {
2247   // xdl def cycles | gfx940 | gfx950
2248   // 2 pass         |    3   |    3
2249   // 4 pass         |    5   |    6
2250   // 8 pass         |    9   |   10
2251   // 16 pass        |   17   |   18
2252   return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2253 }
2254 
2255 static int
2256 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2257   // 2 pass -> 2
2258   // 4 pass -> 4
2259   // 8 pass -> 8
2260   // 16 pass -> 16
2261   return NumPasses;
2262 }
2263 
2264 static int
2265 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2266   // 2 pass -> 4
2267   // 4 pass -> 6
2268   // 8 pass -> 10
2269   // 16 pass -> 18
2270   return NumPasses + 2;
2271 }
2272 
2273 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
2274                                                                 bool IsGFX950) {
2275   // xdl def cycles | gfx942 | gfx950
2276   // 2 pass         |    5   |    5
2277   // 4 pass         |    7   |    8
2278   // 8 pass         |   11   |   12
2279   // 16 pass        |   19   |   20
2280   return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2281 }
2282 
2283 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2284   int WaitStatesNeeded = 0;
2285   unsigned Opc = MI->getOpcode();
2286 
2287   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2288     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2289   };
2290 
2291   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2292     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2293            !SIInstrInfo::isDOT(MI);
2294   };
2295 
2296   if (!SIInstrInfo::isMFMA(*MI))
2297     return WaitStatesNeeded;
2298 
2299   const int VALUWritesExecWaitStates = 4;
2300   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2301     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2302                           VALUWritesExecWaitStates);
2303   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2304 
2305   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2306 
2307   // Loop handling the 2nd instruction of both DGEMM and S/HGEMM sequences.
2308   for (const MachineOperand &Use : MI->explicit_uses()) {
2309     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2310     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2311     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2312     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2313     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2314     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2315     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2316     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2317     const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2318     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2319     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2320     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2321     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2322     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2323     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2324     const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2325     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2326     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2327     const int MaxWaitStates = 19;
2328 
2329     if (!Use.isReg())
2330       continue;
2331     Register Reg = Use.getReg();
2332     bool FullReg;
2333     const MachineInstr *MI1;
2334 
2335     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2336                                this](const MachineInstr &MI) {
2337       if (!SIInstrInfo::isMFMA(MI))
2338         return false;
2339       Register DstReg = MI.getOperand(0).getReg();
2340       FullReg = (DstReg == Reg);
2341       MI1 = &MI;
2342       return TRI.regsOverlap(DstReg, Reg);
2343     };
2344 
2345     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2346       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2347     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2348 
2349     int NumWaitStates =
2350         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2351     if (NumWaitStates == std::numeric_limits<int>::max())
2352       continue;
2353 
2354     int OpNo = Use.getOperandNo();
2355     unsigned Opc1 = MI1->getOpcode();
2356     int NeedWaitStates = 0;
2357     if (OpNo == SrcCIdx) {
2358       if (!SIInstrInfo::isDGEMM(Opc) &&
2359           (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2360         NeedWaitStates = 0;
2361       } else if (FullReg) {
2362         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2363              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2364             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2365              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2366           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2367         else if (ST.hasGFX940Insts() &&
2368                  TSchedModel.computeInstrLatency(MI1) == 2)
2369           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2370       } else {
2371         switch (Opc1) {
2372         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2373         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2374         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2375         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2376           if (!TII.isXDL(*MI))
2377             NeedWaitStates =
2378                 ST.hasGFX950Insts()
2379                     ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2380                     : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2381           break;
2382         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2383         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2384           if (!TII.isXDL(*MI))
2385             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2386           break;
2387         default:
2388           int NumPasses = TSchedModel.computeInstrLatency(MI1);
2389           if (ST.hasGFX940Insts()) {
2390             if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2391               break;
2392 
2393             NeedWaitStates =
2394                 TII.isXDL(*MI1)
2395                     ? (TII.isXDL(*MI)
2396                            ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2397                                  NumPasses, ST.hasGFX950Insts())
2398                            : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2399                                  NumPasses, ST.hasGFX950Insts()))
2400                     : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2401                           NumPasses);
2402             break;
2403           }
2404 
2405           switch (NumPasses) {
2406           case 2:
2407             NeedWaitStates =
2408                 SIInstrInfo::isDGEMM(Opc)
2409                     ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2410                     : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2411             break;
2412           case 8:
2413             NeedWaitStates =
2414                 SIInstrInfo::isDGEMM(Opc)
2415                     ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2416                     : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2417             break;
2418           case 16:
2419             NeedWaitStates =
2420                 SIInstrInfo::isDGEMM(Opc)
2421                     ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2422                     : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2423             break;
2424           default:
2425             llvm_unreachable("unexpected number of passes");
2426           }
2427         }
2428       }
2429     } else {
2430       switch (Opc1) {
2431       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2432       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2433       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2434       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2435         NeedWaitStates =
2436             ST.hasGFX950Insts()
2437                 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2438                 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2439         break;
2440       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2441       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2442         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2443         break;
2444       default:
2445         int NumPasses = TSchedModel.computeInstrLatency(MI1);
2446 
2447         if (ST.hasGFX940Insts()) {
2448           NeedWaitStates =
2449               TII.isXDL(*MI1)
2450                   ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2451                         NumPasses, ST.hasGFX950Insts())
2452                   : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2453                         NumPasses);
2454           break;
2455         }
2456 
2457         switch (NumPasses) {
2458         case 2:
2459           NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2460           break;
2461         case 4:
2462           llvm_unreachable("unexpected number of passes for mfma");
2463         case 8:
2464           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2465           break;
2466         case 16:
2467         default:
2468           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2469         }
2470       }
2471     }
2472     if (WaitStatesNeeded >= NeedWaitStates)
2473       continue;
2474 
2475     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2476     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2477 
2478     if (WaitStatesNeeded == MaxWaitStates)
2479       break;
2480   }
2481 
2482   // Pad neighboring MFMA with noops for better inter-wave performance.
2483   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2484 
2485   return WaitStatesNeeded;
2486 }
2487 
2488 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2489   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2490   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2491     return 0;
2492 
2493   int WaitStatesNeeded = 0;
2494 
2495   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2496     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2497   };
2498 
2499   for (const MachineOperand &Op : MI->explicit_uses()) {
2500     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2501       continue;
2502 
2503     Register Reg = Op.getReg();
2504 
2505     const int AccVgprReadLdStWaitStates = 2;
2506     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2507     const int MaxWaitStates = 2;
2508 
2509     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2510       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2511     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2512 
2513     if (WaitStatesNeeded == MaxWaitStates)
2514       return WaitStatesNeeded; // Early exit.
2515 
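         // Also hazardous: a v_accvgpr_read/write when a plain (non-MAI) VALU has
         // defined Reg within the last two wait states.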
2516     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2517       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2518           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2519         return false;
2520       auto IsVALUFn = [](const MachineInstr &MI) {
2521         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2522       };
2523       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2524              std::numeric_limits<int>::max();
2525     };
2526 
2527     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2528       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2529     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2530   }
2531 
2532   return WaitStatesNeeded;
2533 }
2534 
2535 int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2536   assert(!ST.hasVcmpxPermlaneHazard() &&
2537          "this is a different vcmpx+permlane hazard");
2538   const SIRegisterInfo *TRI = ST.getRegisterInfo();
2539   const SIInstrInfo *TII = ST.getInstrInfo();
2540 
2541   auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2542     return isVCmpXWritesExec(*TII, *TRI, MI);
2543   };
2544 
2545   auto IsVALUFn = [](const MachineInstr &MI) {
2546     return SIInstrInfo::isVALU(MI);
2547   };
2548 
2549   const int VCmpXWritesExecWaitStates = 4;
2550   const int VALUWritesVDstWaitStates = 2;
2551   int WaitStatesNeeded = 0;
2552 
2553   for (const MachineOperand &Op : MI->explicit_uses()) {
2554     if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2555       continue;
2556     Register Reg = Op.getReg();
2557 
2558     int WaitStatesSinceDef =
2559         VALUWritesVDstWaitStates -
2560         getWaitStatesSinceDef(Reg, IsVALUFn,
2561                               /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2562     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2563     if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2564       break;
2565   }
2566 
2567   int VCmpXHazardWaits =
2568       VCmpXWritesExecWaitStates -
2569       getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2570 
2571   WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2572   return WaitStatesNeeded;
2573 }
2574 
2575 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2576   // 2 pass -> 4
2577   // 4 pass -> 6
2578   // 8 pass -> 10
2579   // 16 pass -> 18
2580   return NumPasses + 2;
2581 }
2582 
2583 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
2584                                                        bool IsGFX950) {
2585   // xdl def cycles | gfx942 | gfx950
2586   // 2 pass         |    5   |    5
2587   // 4 pass         |    7   |    8
2588   // 8 pass         |   11   |   12
2589   // 16 pass        |   19   |   20
2590   return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2591 }
2592 
2593 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
2594                                                               bool IsGFX950) {
2595   // xdl def cycles | gfx942 | gfx950
2596   // 2 pass         |    5   |    5
2597   // 4 pass         |    7   |    8
2598   // 8 pass         |   11   |   12
2599   // 16 pass        |   19   |   20
2600   return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2601 }
2602 
2603 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2604   // 2 pass -> 4
2605   // 4 pass -> 6
2606   // 8 pass -> 10
2607   // 16 pass -> 18
2608   return NumPasses + 2;
2609 }
2610 
2611 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2612   if (!ST.hasGFX90AInsts())
2613     return 0;
2614 
2615   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2616     return SIInstrInfo::isDGEMM(MI.getOpcode());
2617   };
2618 
2619   // This is checked in checkMAIHazards90A()
2620   if (SIInstrInfo::isMFMA(*MI))
2621     return 0;
2622 
2623   const MachineRegisterInfo &MRI = MF.getRegInfo();
2624 
2625   int WaitStatesNeeded = 0;
2626 
2627   bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2628   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2629   bool IsVALU = SIInstrInfo::isVALU(*MI);
2630 
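       // Backward-search predicates; each records the matching producer so its
       // latency can be inspected after the search.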
2631   const MachineInstr *MFMA = nullptr;
2632   unsigned Reg;
2633   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2634     if (!SIInstrInfo::isMFMA(MI) ||
2635         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2636       return false;
2637     MFMA = &MI;
2638     return true;
2639   };
2640 
2641   const MachineInstr *DOT = nullptr;
2642   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2643     if (!SIInstrInfo::isDOT(MI) ||
2644         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2645       return false;
2646     DOT = &MI;
2647     return true;
2648   };
2649 
2650   bool DGEMMAfterVALUWrite = false;
2651   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2652     // Found DGEMM on reverse traversal to def.
2653     if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2654       DGEMMAfterVALUWrite = true;
2655 
2656     // Only a hazard if the register is defined by a VALU and a DGEMM is
2657     // found after the def.
2658     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2659       return false;
2660 
2661     return true;
2662   };
2663 
2664   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2665                                            AMDGPU::OpName::src2);
2666 
2667   if (IsMemOrExport || IsVALU) {
2668     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2669     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2670     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2671     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2672     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2673     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2674     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2675     const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2676     const int DotWriteSameDotReadSrcAB = 3;
2677     const int DotWriteDifferentVALURead = 3;
2678     const int DMFMABetweenVALUWriteVMEMRead = 2;
2679     const int MaxWaitStates = 19;
2680 
2681     for (const MachineOperand &Use : MI->explicit_uses()) {
2682       if (!Use.isReg())
2683         continue;
2684       Reg = Use.getReg();
2685 
2686       DOT = nullptr;
2687       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2688                                                      MaxWaitStates);
2689       if (DOT) {
2690         int NeedWaitStates = 0;
2691         if (DOT->getOpcode() == MI->getOpcode()) {
2692           if (&Use - &MI->getOperand(0) != SrcCIdx)
2693             NeedWaitStates = DotWriteSameDotReadSrcAB;
2694         } else {
2695           NeedWaitStates = DotWriteDifferentVALURead;
2696         }
2697 
2698         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2699         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2700       }
2701 
2702       // Workaround for a HW data hazard bug observed only on GFX90A: when a
2703       // DGEMM instruction sits in-between a VALU and a VMEM instruction, the
2704       // SQ incorrectly fails to insert the two wait states between them that
2705       // are needed to avoid the data hazard.
2706       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2707         DGEMMAfterVALUWrite = false;
2708         if (TRI.isVectorRegister(MRI, Reg)) {
2709           int WaitStatesNeededForUse =
2710                 DMFMABetweenVALUWriteVMEMRead -
2711                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2712                                       DMFMABetweenVALUWriteVMEMRead);
2713 
2714           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2715         }
2716       }
2717 
2718       MFMA = nullptr;
2719       WaitStatesSinceDef =
2720           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2721       if (!MFMA)
2722         continue;
2723 
2724       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2725       int NumPasses = HazardDefLatency;
2726       int NeedWaitStates = MaxWaitStates;
2727 
2728       if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
2729         switch (HazardDefLatency) {
2730         case 4:
2731           NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2732                                          : DMFMA4x4WriteVgprVALUReadWaitStates;
2733           break;
2734         case 8:
2735         case 16:
2736           NeedWaitStates =
2737               IsMemOrExport
2738                   ? DMFMA16x16WriteVgprMemExpReadWaitStates
2739                   : (ST.hasGFX950Insts()
2740                          ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2741                          : DMFMA16x16WriteVgprVALUReadWaitStates);
2742           break;
2743         default:
2744           llvm_unreachable("unexpected dgemm");
2745         }
2746       } else if (ST.hasGFX940Insts()) {
2747         NeedWaitStates =
2748             TII.isXDL(*MFMA)
2749                 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
2750                       NumPasses, ST.hasGFX950Insts())
2751                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2752                       NumPasses);
2753       } else {
2754         switch (HazardDefLatency) {
2755         case 2:
2756           NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2757           break;
2758         case 8:
2759           NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2760           break;
2761         case 16:
2762           NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2763           break;
2764         default:
2765           llvm_unreachable("unexpected number of passes for mfma");
2766         }
2767       }
2768 
2769       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2770       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2771 
2772       if (WaitStatesNeeded == MaxWaitStates)
2773         break;
2774     }
2775   }
2776 
2777   unsigned Opc = MI->getOpcode();
2778   const int DMFMAToFMA64WaitStates = 2;
2779   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2780        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2781        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2782       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2783     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2784       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2785     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2786   }
2787 
2788   if (!IsVALU && !IsMemOrExport)
2789     return WaitStatesNeeded;
2790 
2791   for (const MachineOperand &Def : MI->defs()) {
2792     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2793     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2794     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2795     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2796     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2797     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2798     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2799     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2800     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2801     const int DotWriteDifferentVALUWrite = 3;
2802     const int MaxWaitStates = 19;
2803     const int MaxWarWaitStates = 15;
2804 
2805     Reg = Def.getReg();
2806 
2807     DOT = nullptr;
2808     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2809                                                    MaxWaitStates);
2810     if (DOT && DOT->getOpcode() != MI->getOpcode())
2811       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2812                                                     WaitStatesSinceDef);
2813 
2814     MFMA = nullptr;
2815     WaitStatesSinceDef =
2816         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2817     if (MFMA) {
2818       int NeedWaitStates = MaxWaitStates;
2819       int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2820 
2821       if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
2822         switch (NumPasses) {
2823         case 4:
2824           NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2825           break;
2826         case 8:
2827         case 16:
2828           NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2829           break;
2830         default:
2831           llvm_unreachable("unexpected number of cycles for dgemm");
2832         }
2833       } else if (ST.hasGFX940Insts()) {
2834         NeedWaitStates =
2835             TII.isXDL(*MFMA)
2836                 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
2837                       NumPasses, ST.hasGFX950Insts())
2838                 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2839       } else {
2840         switch (NumPasses) {
2841         case 2:
2842           NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2843           break;
2844         case 8:
2845           NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2846           break;
2847         case 16:
2848           NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2849           break;
2850         default:
2851           llvm_unreachable("Unexpected number of passes for mfma");
2852         }
2853       }
2854 
2855       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2856       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2857 
2858       if (WaitStatesNeeded == MaxWaitStates)
2859         break;
2860     }
2861 
2862     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2863       if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
2864           !MI.readsRegister(Reg, &TRI))
2865         return false;
2866 
2867       if (ST.hasGFX940Insts() && !TII.isXDL(MI))
2868         return false;
2869 
2870       const MachineOperand *SrcC =
2871           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2872       assert(SrcC);
2873       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2874         return false;
2875 
2876       MFMA = &MI;
2877       return true;
2878     };
2879 
2880     MFMA = nullptr;
2881     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2882                                                 MaxWarWaitStates);
2883     if (!MFMA)
2884       continue;
2885 
2886     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2887     int NeedWaitStates = MaxWaitStates;
2888     switch (HazardDefLatency) {
2889     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2890              break;
2891     case 4:  assert(ST.hasGFX940Insts());
2892              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2893              break;
2894     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2895              break;
2896     case 16: [[fallthrough]];
2897     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2898              break;
2899     }
2900 
2901     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2902     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2903   }
2904 
2905   return WaitStatesNeeded;
2906 }
2907 
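     // Prefer scheduling another candidate if this one is an MFMA that would issue
     // inside the latency window of the previous MFMA.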
2908 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2909   if (!SU->isInstr())
2910     return false;
2911 
2912   const MachineInstr *MAI = nullptr;
2913 
2914   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2915     MAI = nullptr;
2916     if (SIInstrInfo::isMFMA(MI))
2917       MAI = &MI;
2918     return MAI != nullptr;
2919   };
2920 
2921   MachineInstr *MI = SU->getInstr();
2922   if (IsMFMAFn(*MI)) {
2923     int W = getWaitStatesSince(IsMFMAFn, 16);
2924     if (MAI)
2925       return W < (int)TSchedModel.computeInstrLatency(MAI);
2926   }
2927 
2928   return false;
2929 }
2930 
2931 // Adjust global offsets for instructions bundled with S_GETPC_B64 after
2932 // insertion of a new instruction.
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;

  // Find start of bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any references in the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    NextMI++;
  }
}

bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. SALU writes SGPR
  //   3. SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient.
  // In practice this happens in under 10% of cases, so to avoid searching
  // this conservatively assumes the hazard exists whenever 1 and 2 are
  // present.

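  // For example (illustrative only; the exact instructions are placeholders):
  //   v_cndmask_b32_e32 v0, v1, v2, vcc  ; (1) VALU implicitly reads VCC
  //   s_mov_b64 vcc, exec                ; (2) SALU writes VCC
  //   s_mov_b64 s[0:1], vcc              ; (3) SALU reads VCC
  // The fix inserts "s_waitcnt_depctr sa_sdst(0)" after (2).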
  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These implicitly read VCC as mask source.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // s_waitcnt_depctr sa_sdst(0) mitigates the hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;

    // A VALU access to any SGPR or literal constant other than HazardReg
    // also mitigates the hazard. There is no need to check HazardReg here
    // because this is only called when IsHazardFn has not matched.
    if (!SIInstrInfo::isVALU(I))
      return false;
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        // Only consider uses.
        if (!Op.isUse())
          continue;
        // Ignore EXEC.
        if (OpReg == AMDGPU::EXEC ||
            OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC.
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC ||
              OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
        if (!TII.isInlineConstant(Op, OpInfo))
          return true;
      }
    }
    return false;
  };

  // Check for hazard.
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}

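// Ensure the entry block begins with an S_SETPRIO of at least Priority;
// returns true if an instruction was inserted.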
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}

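// On subtargets with hasRequiredExportPriority(), shaders that export must
// run at normal priority (S_SETPRIO 2) and drop to priority 0 around the end
// of each export sequence; insert and adjust S_SETPRIO to satisfy this.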
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises priority at entry.
    // This guarantees the correct priority if exports exist in a callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise the minimum priority unless this is part of the workaround.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check the entry priority at each export (there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // The workaround is only needed at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume an appropriate S_SETPRIO after an export means the workaround
    // was already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

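  // The sequence emitted below is roughly (the expcnt wait and the final
  // restore are skipped at the end of the shader):
  //   s_setprio 0            ; lower priority after the last export
  //   s_waitcnt_expcnt null, 0x0
  //   s_nop 0
  //   s_nop 0
  //   s_setprio 2            ; return to normal priority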
  // Lower priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}