xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 179219ea046f46927d6478d43431e8b541703539)
1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/CodeGen/MachineFunction.h"
17 #include "llvm/CodeGen/ScheduleDAG.h"
18 #include "llvm/Support/TargetParser.h"
19 
20 using namespace llvm;
21 
22 //===----------------------------------------------------------------------===//
23 // Hazard Recognizer Implementation
24 //===----------------------------------------------------------------------===//
25 
26 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
27   IsHazardRecognizerMode(false),
28   CurrCycleInstr(nullptr),
29   MF(MF),
30   ST(MF.getSubtarget<GCNSubtarget>()),
31   TII(*ST.getInstrInfo()),
32   TRI(TII.getRegisterInfo()),
33   ClauseUses(TRI.getNumRegUnits()),
34   ClauseDefs(TRI.getNumRegUnits()) {
35   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
36   TSchedModel.init(&ST);
37 }
38 
// Drops every entry from the list of tracked (already emitted) instructions.
void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
42 
43 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
44   EmitInstruction(SU->getInstr());
45 }
46 
// Records \p MI as the instruction of the current cycle. It is only added to
// EmittedInstrs later, when AdvanceCycle() runs.
void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}
50 
51 static bool isDivFMas(unsigned Opcode) {
52   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
53 }
54 
55 static bool isSGetReg(unsigned Opcode) {
56   return Opcode == AMDGPU::S_GETREG_B32;
57 }
58 
59 static bool isSSetReg(unsigned Opcode) {
60   switch (Opcode) {
61   case AMDGPU::S_SETREG_B32:
62   case AMDGPU::S_SETREG_B32_mode:
63   case AMDGPU::S_SETREG_IMM32_B32:
64   case AMDGPU::S_SETREG_IMM32_B32_mode:
65     return true;
66   }
67   return false;
68 }
69 
70 static bool isRWLane(unsigned Opcode) {
71   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
72 }
73 
74 static bool isRFE(unsigned Opcode) {
75   return Opcode == AMDGPU::S_RFE_B64;
76 }
77 
78 static bool isSMovRel(unsigned Opcode) {
79   switch (Opcode) {
80   case AMDGPU::S_MOVRELS_B32:
81   case AMDGPU::S_MOVRELS_B64:
82   case AMDGPU::S_MOVRELD_B32:
83   case AMDGPU::S_MOVRELD_B64:
84     return true;
85   default:
86     return false;
87   }
88 }
89 
90 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
91                                     const MachineInstr &MI) {
92   if (TII.isAlwaysGDS(MI.getOpcode()))
93     return true;
94 
95   switch (MI.getOpcode()) {
96   case AMDGPU::S_SENDMSG:
97   case AMDGPU::S_SENDMSGHALT:
98   case AMDGPU::S_TTRACEDATA:
99     return true;
100   // These DS opcodes don't support GDS.
101   case AMDGPU::DS_NOP:
102   case AMDGPU::DS_PERMUTE_B32:
103   case AMDGPU::DS_BPERMUTE_B32:
104     return false;
105   default:
106     if (TII.isDS(MI.getOpcode())) {
107       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
108                                            AMDGPU::OpName::gds);
109       if (MI.getOperand(GDS).getImm())
110         return true;
111     }
112     return false;
113   }
114 }
115 
116 static bool isPermlane(const MachineInstr &MI) {
117   unsigned Opcode = MI.getOpcode();
118   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
119          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
120 }
121 
122 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
123   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
124                                                      AMDGPU::OpName::simm16);
125   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
126 }
127 
128 ScheduleHazardRecognizer::HazardType
129 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
130   MachineInstr *MI = SU->getInstr();
131   // If we are not in "HazardRecognizerMode" and therefore not being run from
132   // the scheduler, track possible stalls from hazards but don't insert noops.
133   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
134 
135   if (MI->isBundle())
136    return NoHazard;
137 
138   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
139     return HazardType;
140 
141   // FIXME: Should flat be considered vmem?
142   if ((SIInstrInfo::isVMEM(*MI) ||
143        SIInstrInfo::isFLAT(*MI))
144       && checkVMEMHazards(MI) > 0)
145     return HazardType;
146 
147   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
148     return HazardType;
149 
150   if (checkFPAtomicToDenormModeHazard(MI) > 0)
151     return HazardType;
152 
153   if (ST.hasNoDataDepHazard())
154     return NoHazard;
155 
156   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
157     return HazardType;
158 
159   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
160     return HazardType;
161 
162   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
163     return HazardType;
164 
165   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
166     return HazardType;
167 
168   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
169     return HazardType;
170 
171   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
172     return HazardType;
173 
174   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
175     return HazardType;
176 
177   if (ST.hasReadM0MovRelInterpHazard() &&
178       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
179       checkReadM0Hazards(MI) > 0)
180     return HazardType;
181 
182   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
183       checkReadM0Hazards(MI) > 0)
184     return HazardType;
185 
186   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
187     return HazardType;
188 
189   if ((SIInstrInfo::isVMEM(*MI) ||
190        SIInstrInfo::isFLAT(*MI) ||
191        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
192     return HazardType;
193 
194   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
195     return HazardType;
196 
197   return NoHazard;
198 }
199 
200 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
201                                 unsigned Quantity) {
202   while (Quantity > 0) {
203     unsigned Arg = std::min(Quantity, 8u);
204     Quantity -= Arg;
205     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
206         .addImm(Arg - 1);
207   }
208 }
209 
210 void GCNHazardRecognizer::processBundle() {
211   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
212   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
213   // Check bundled MachineInstr's for hazards.
214   for (; MI != E && MI->isInsideBundle(); ++MI) {
215     CurrCycleInstr = &*MI;
216     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
217 
218     if (IsHazardRecognizerMode) {
219       fixHazards(CurrCycleInstr);
220 
221       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
222     }
223 
224     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
225     // include the bundled MI directly after, only add a maximum of
226     // (MaxLookAhead - 1) noops to EmittedInstrs.
227     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
228       EmittedInstrs.push_front(nullptr);
229 
230     EmittedInstrs.push_front(CurrCycleInstr);
231     EmittedInstrs.resize(MaxLookAhead);
232   }
233   CurrCycleInstr = nullptr;
234 }
235 
236 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
237   IsHazardRecognizerMode = true;
238   CurrCycleInstr = MI;
239   unsigned W = PreEmitNoopsCommon(MI);
240   fixHazards(MI);
241   CurrCycleInstr = nullptr;
242   return W;
243 }
244 
245 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
246   if (MI->isBundle())
247     return 0;
248 
249   int WaitStates = 0;
250 
251   if (SIInstrInfo::isSMRD(*MI))
252     return std::max(WaitStates, checkSMRDHazards(MI));
253 
254   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
255     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
256 
257   if (ST.hasNSAtoVMEMBug())
258     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
259 
260   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
261 
262   if (ST.hasNoDataDepHazard())
263     return WaitStates;
264 
265   if (SIInstrInfo::isVALU(*MI))
266     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
267 
268   if (SIInstrInfo::isDPP(*MI))
269     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
270 
271   if (isDivFMas(MI->getOpcode()))
272     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
273 
274   if (isRWLane(MI->getOpcode()))
275     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
276 
277   if (MI->isInlineAsm())
278     return std::max(WaitStates, checkInlineAsmHazards(MI));
279 
280   if (isSGetReg(MI->getOpcode()))
281     return std::max(WaitStates, checkGetRegHazards(MI));
282 
283   if (isSSetReg(MI->getOpcode()))
284     return std::max(WaitStates, checkSetRegHazards(MI));
285 
286   if (isRFE(MI->getOpcode()))
287     return std::max(WaitStates, checkRFEHazards(MI));
288 
289   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
290                                            isSMovRel(MI->getOpcode())))
291     return std::max(WaitStates, checkReadM0Hazards(MI));
292 
293   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
294     return std::max(WaitStates, checkReadM0Hazards(MI));
295 
296   if (SIInstrInfo::isMAI(*MI))
297     return std::max(WaitStates, checkMAIHazards(MI));
298 
299   if (SIInstrInfo::isVMEM(*MI) ||
300       SIInstrInfo::isFLAT(*MI) ||
301       SIInstrInfo::isDS(*MI))
302     return std::max(WaitStates, checkMAILdStHazards(MI));
303 
304   return WaitStates;
305 }
306 
// Records a noop as a null entry at the front of EmittedInstrs.
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}
310 
311 void GCNHazardRecognizer::AdvanceCycle() {
312   // When the scheduler detects a stall, it will call AdvanceCycle() without
313   // emitting any instructions.
314   if (!CurrCycleInstr) {
315     EmittedInstrs.push_front(nullptr);
316     return;
317   }
318 
319   // Do not track non-instructions which do not affect the wait states.
320   // If included, these instructions can lead to buffer overflow such that
321   // detectable hazards are missed.
322   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
323       CurrCycleInstr->isKill()) {
324     CurrCycleInstr = nullptr;
325     return;
326   }
327 
328   if (CurrCycleInstr->isBundle()) {
329     processBundle();
330     return;
331   }
332 
333   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
334 
335   // Keep track of emitted instructions
336   EmittedInstrs.push_front(CurrCycleInstr);
337 
338   // Add a nullptr for each additional wait state after the first.  Make sure
339   // not to add more than getMaxLookAhead() items to the list, since we
340   // truncate the list to that size right after this loop.
341   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
342        i < e; ++i) {
343     EmittedInstrs.push_front(nullptr);
344   }
345 
346   // getMaxLookahead() is the largest number of wait states we will ever need
347   // to insert, so there is no point in keeping track of more than that many
348   // wait states.
349   EmittedInstrs.resize(getMaxLookAhead());
350 
351   CurrCycleInstr = nullptr;
352 }
353 
// Bottom-up scheduling is not supported by this recognizer; reaching this
// function is treated as unreachable.
void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}
357 
358 //===----------------------------------------------------------------------===//
359 // Helper Functions
360 //===----------------------------------------------------------------------===//
361 
362 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
363 
364 // Returns a minimum wait states since \p I walking all predecessors.
365 // Only scans until \p IsExpired does not return true.
366 // Can only be run in a hazard recognizer mode.
367 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
368                               MachineBasicBlock *MBB,
369                               MachineBasicBlock::reverse_instr_iterator I,
370                               int WaitStates,
371                               IsExpiredFn IsExpired,
372                               DenseSet<const MachineBasicBlock *> &Visited) {
373   for (auto E = MBB->instr_rend(); I != E; ++I) {
374     // Don't add WaitStates for parent BUNDLE instructions.
375     if (I->isBundle())
376       continue;
377 
378     if (IsHazard(&*I))
379       return WaitStates;
380 
381     if (I->isInlineAsm() || I->isMetaInstruction())
382       continue;
383 
384     WaitStates += SIInstrInfo::getNumWaitStates(*I);
385 
386     if (IsExpired(&*I, WaitStates))
387       return std::numeric_limits<int>::max();
388   }
389 
390   int MinWaitStates = WaitStates;
391   bool Found = false;
392   for (MachineBasicBlock *Pred : MBB->predecessors()) {
393     if (!Visited.insert(Pred).second)
394       continue;
395 
396     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
397                                WaitStates, IsExpired, Visited);
398 
399     if (W == std::numeric_limits<int>::max())
400       continue;
401 
402     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
403     if (IsExpired(nullptr, MinWaitStates))
404       return MinWaitStates;
405 
406     Found = true;
407   }
408 
409   if (Found)
410     return MinWaitStates;
411 
412   return std::numeric_limits<int>::max();
413 }
414 
415 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
416                               MachineInstr *MI,
417                               IsExpiredFn IsExpired) {
418   DenseSet<const MachineBasicBlock *> Visited;
419   return getWaitStatesSince(IsHazard, MI->getParent(),
420                             std::next(MI->getReverseIterator()),
421                             0, IsExpired, Visited);
422 }
423 
424 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
425   if (IsHazardRecognizerMode) {
426     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
427       return WaitStates >= Limit;
428     };
429     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
430   }
431 
432   int WaitStates = 0;
433   for (MachineInstr *MI : EmittedInstrs) {
434     if (MI) {
435       if (IsHazard(MI))
436         return WaitStates;
437 
438       if (MI->isInlineAsm())
439         continue;
440     }
441     ++WaitStates;
442 
443     if (WaitStates >= Limit)
444       break;
445   }
446   return std::numeric_limits<int>::max();
447 }
448 
449 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
450                                                IsHazardFn IsHazardDef,
451                                                int Limit) {
452   const SIRegisterInfo *TRI = ST.getRegisterInfo();
453 
454   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
455     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
456   };
457 
458   return getWaitStatesSince(IsHazardFn, Limit);
459 }
460 
461 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
462                                                   int Limit) {
463   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
464     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
465   };
466 
467   return getWaitStatesSince(IsHazardFn, Limit);
468 }
469 
470 //===----------------------------------------------------------------------===//
471 // No-op Hazard Detection
472 //===----------------------------------------------------------------------===//
473 
474 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
475                         MCRegister Reg) {
476   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
477     BV.set(*RUI);
478 }
479 
480 static void addRegsToSet(const SIRegisterInfo &TRI,
481                          iterator_range<MachineInstr::const_mop_iterator> Ops,
482                          BitVector &Set) {
483   for (const MachineOperand &Op : Ops) {
484     if (Op.isReg())
485       addRegUnits(TRI, Set, Op.getReg().asMCReg());
486   }
487 }
488 
489 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
490   // XXX: Do we need to worry about implicit operands
491   addRegsToSet(TRI, MI.defs(), ClauseDefs);
492   addRegsToSet(TRI, MI.uses(), ClauseUses);
493 }
494 
495 static bool breaksSMEMSoftClause(MachineInstr *MI) {
496   return !SIInstrInfo::isSMRD(*MI);
497 }
498 
499 static bool breaksVMEMSoftClause(MachineInstr *MI) {
500   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
501 }
502 
503 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
504   // SMEM soft clause are only present on VI+, and only matter if xnack is
505   // enabled.
506   if (!ST.isXNACKEnabled())
507     return 0;
508 
509   bool IsSMRD = TII.isSMRD(*MEM);
510 
511   resetClause();
512 
513   // A soft-clause is any group of consecutive SMEM instructions.  The
514   // instructions in this group may return out of order and/or may be
515   // replayed (i.e. the same instruction issued more than once).
516   //
517   // In order to handle these situations correctly we need to make sure that
518   // when a clause has more than one instruction, no instruction in the clause
519   // writes to a register that is read by another instruction in the clause
520   // (including itself). If we encounter this situaion, we need to break the
521   // clause by inserting a non SMEM instruction.
522 
523   for (MachineInstr *MI : EmittedInstrs) {
524     // When we hit a non-SMEM instruction then we have passed the start of the
525     // clause and we can stop.
526     if (!MI)
527       break;
528 
529     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
530       break;
531 
532     addClauseInst(*MI);
533   }
534 
535   if (ClauseDefs.none())
536     return 0;
537 
538   // We need to make sure not to put loads and stores in the same clause if they
539   // use the same address. For now, just start a new clause whenever we see a
540   // store.
541   if (MEM->mayStore())
542     return 1;
543 
544   addClauseInst(*MEM);
545 
546   // If the set of defs and uses intersect then we cannot add this instruction
547   // to the clause, so we have a hazard.
548   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
549 }
550 
551 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
552   int WaitStatesNeeded = 0;
553 
554   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
555 
556   // This SMRD hazard only affects SI.
557   if (!ST.hasSMRDReadVALUDefHazard())
558     return WaitStatesNeeded;
559 
560   // A read of an SGPR by SMRD instruction requires 4 wait states when the
561   // SGPR was written by a VALU instruction.
562   int SmrdSgprWaitStates = 4;
563   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
564   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
565 
566   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
567 
568   for (const MachineOperand &Use : SMRD->uses()) {
569     if (!Use.isReg())
570       continue;
571     int WaitStatesNeededForUse =
572         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
573                                                    SmrdSgprWaitStates);
574     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
575 
576     // This fixes what appears to be undocumented hardware behavior in SI where
577     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
578     // needs some number of nops in between. We don't know how many we need, but
579     // let's use 4. This wasn't discovered before probably because the only
580     // case when this happens is when we expand a 64-bit pointer into a full
581     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
582     // probably never encountered in the closed-source land.
583     if (IsBufferSMRD) {
584       int WaitStatesNeededForUse =
585         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
586                                                    IsBufferHazardDefFn,
587                                                    SmrdSgprWaitStates);
588       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
589     }
590   }
591 
592   return WaitStatesNeeded;
593 }
594 
595 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
596   if (!ST.hasVMEMReadSGPRVALUDefHazard())
597     return 0;
598 
599   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
600 
601   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
602   // SGPR was written by a VALU Instruction.
603   const int VmemSgprWaitStates = 5;
604   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
605   for (const MachineOperand &Use : VMEM->uses()) {
606     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
607       continue;
608 
609     int WaitStatesNeededForUse =
610         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
611                                                    VmemSgprWaitStates);
612     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
613   }
614   return WaitStatesNeeded;
615 }
616 
617 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
618   const SIRegisterInfo *TRI = ST.getRegisterInfo();
619   const SIInstrInfo *TII = ST.getInstrInfo();
620 
621   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
622   int DppVgprWaitStates = 2;
623   int DppExecWaitStates = 5;
624   int WaitStatesNeeded = 0;
625   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
626 
627   for (const MachineOperand &Use : DPP->uses()) {
628     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
629       continue;
630     int WaitStatesNeededForUse =
631         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
632                               [](MachineInstr *) { return true; },
633                               DppVgprWaitStates);
634     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
635   }
636 
637   WaitStatesNeeded = std::max(
638       WaitStatesNeeded,
639       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
640                                                 DppExecWaitStates));
641 
642   return WaitStatesNeeded;
643 }
644 
645 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
646   const SIInstrInfo *TII = ST.getInstrInfo();
647 
648   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
649   // instruction.
650   const int DivFMasWaitStates = 4;
651   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
652   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
653                                                DivFMasWaitStates);
654 
655   return DivFMasWaitStates - WaitStatesNeeded;
656 }
657 
658 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
659   const SIInstrInfo *TII = ST.getInstrInfo();
660   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
661 
662   const int GetRegWaitStates = 2;
663   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
664     return GetRegHWReg == getHWReg(TII, *MI);
665   };
666   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
667 
668   return GetRegWaitStates - WaitStatesNeeded;
669 }
670 
671 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
672   const SIInstrInfo *TII = ST.getInstrInfo();
673   unsigned HWReg = getHWReg(TII, *SetRegInstr);
674 
675   const int SetRegWaitStates = ST.getSetRegWaitStates();
676   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
677     return HWReg == getHWReg(TII, *MI);
678   };
679   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
680   return SetRegWaitStates - WaitStatesNeeded;
681 }
682 
683 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
684   if (!MI.mayStore())
685     return -1;
686 
687   const SIInstrInfo *TII = ST.getInstrInfo();
688   unsigned Opcode = MI.getOpcode();
689   const MCInstrDesc &Desc = MI.getDesc();
690 
691   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
692   int VDataRCID = -1;
693   if (VDataIdx != -1)
694     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
695 
696   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
697     // There is no hazard if the instruction does not use vector regs
698     // (like wbinvl1)
699     if (VDataIdx == -1)
700       return -1;
701     // For MUBUF/MTBUF instructions this hazard only exists if the
702     // instruction is not using a register in the soffset field.
703     const MachineOperand *SOffset =
704         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
705     // If we have no soffset operand, then assume this field has been
706     // hardcoded to zero.
707     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
708         (!SOffset || !SOffset->isReg()))
709       return VDataIdx;
710   }
711 
712   // MIMG instructions create a hazard if they don't use a 256-bit T# and
713   // the store size is greater than 8 bytes and they have more than two bits
714   // of their dmask set.
715   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
716   if (TII->isMIMG(MI)) {
717     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
718     assert(SRsrcIdx != -1 &&
719            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
720     (void)SRsrcIdx;
721   }
722 
723   if (TII->isFLAT(MI)) {
724     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
725     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
726       return DataIdx;
727   }
728 
729   return -1;
730 }
731 
732 int
733 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
734                                             const MachineRegisterInfo &MRI) {
735   // Helper to check for the hazard where VMEM instructions that store more than
736   // 8 bytes can have there store data over written by the next instruction.
737   const SIRegisterInfo *TRI = ST.getRegisterInfo();
738 
739   const int VALUWaitStates = 1;
740   int WaitStatesNeeded = 0;
741 
742   if (!TRI->isVGPR(MRI, Def.getReg()))
743     return WaitStatesNeeded;
744   Register Reg = Def.getReg();
745   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
746     int DataIdx = createsVALUHazard(*MI);
747     return DataIdx >= 0 &&
748     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
749   };
750   int WaitStatesNeededForDef =
751     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
752   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
753 
754   return WaitStatesNeeded;
755 }
756 
757 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
758   // This checks for the hazard where VMEM instructions that store more than
759   // 8 bytes can have there store data over written by the next instruction.
760   if (!ST.has12DWordStoreHazard())
761     return 0;
762 
763   const MachineRegisterInfo &MRI = MF.getRegInfo();
764   int WaitStatesNeeded = 0;
765 
766   for (const MachineOperand &Def : VALU->defs()) {
767     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
768   }
769 
770   return WaitStatesNeeded;
771 }
772 
773 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
774   // This checks for hazards associated with inline asm statements.
775   // Since inline asms can contain just about anything, we use this
776   // to call/leverage other check*Hazard routines. Note that
777   // this function doesn't attempt to address all possible inline asm
778   // hazards (good luck), but is a collection of what has been
779   // problematic thus far.
780 
781   // see checkVALUHazards()
782   if (!ST.has12DWordStoreHazard())
783     return 0;
784 
785   const MachineRegisterInfo &MRI = MF.getRegInfo();
786   int WaitStatesNeeded = 0;
787 
788   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
789        I != E; ++I) {
790     const MachineOperand &Op = IA->getOperand(I);
791     if (Op.isReg() && Op.isDef()) {
792       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
793     }
794   }
795 
796   return WaitStatesNeeded;
797 }
798 
799 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
800   const SIInstrInfo *TII = ST.getInstrInfo();
801   const SIRegisterInfo *TRI = ST.getRegisterInfo();
802   const MachineRegisterInfo &MRI = MF.getRegInfo();
803 
804   const MachineOperand *LaneSelectOp =
805       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
806 
807   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
808     return 0;
809 
810   Register LaneSelectReg = LaneSelectOp->getReg();
811   auto IsHazardFn = [TII] (MachineInstr *MI) {
812     return TII->isVALU(*MI);
813   };
814 
815   const int RWLaneWaitStates = 4;
816   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
817                                               RWLaneWaitStates);
818   return RWLaneWaitStates - WaitStatesSince;
819 }
820 
821 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
822   if (!ST.hasRFEHazards())
823     return 0;
824 
825   const SIInstrInfo *TII = ST.getInstrInfo();
826 
827   const int RFEWaitStates = 1;
828 
829   auto IsHazardFn = [TII] (MachineInstr *MI) {
830     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
831   };
832   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
833   return RFEWaitStates - WaitStatesNeeded;
834 }
835 
836 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
837   const SIInstrInfo *TII = ST.getInstrInfo();
838   const int SMovRelWaitStates = 1;
839   auto IsHazardFn = [TII] (MachineInstr *MI) {
840     return TII->isSALU(*MI);
841   };
842   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
843                                                    SMovRelWaitStates);
844 }
845 
// Runs each of the fix* routines on \p MI in a fixed order. Their boolean
// results are ignored here.
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}
853 
854 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
855   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
856     return false;
857 
858   const SIInstrInfo *TII = ST.getInstrInfo();
859   auto IsHazardFn = [TII] (MachineInstr *MI) {
860     return TII->isVOPC(*MI);
861   };
862 
863   auto IsExpiredFn = [] (MachineInstr *MI, int) {
864     if (!MI)
865       return false;
866     unsigned Opc = MI->getOpcode();
867     return SIInstrInfo::isVALU(*MI) &&
868            Opc != AMDGPU::V_NOP_e32 &&
869            Opc != AMDGPU::V_NOP_e64 &&
870            Opc != AMDGPU::V_NOP_sdwa;
871   };
872 
873   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
874       std::numeric_limits<int>::max())
875     return false;
876 
877   // V_NOP will be discarded by SQ.
878   // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
879   // which is always a VGPR and available.
880   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
881   Register Reg = Src0->getReg();
882   bool IsUndef = Src0->isUndef();
883   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
884           TII->get(AMDGPU::V_MOV_B32_e32))
885     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
886     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
887 
888   return true;
889 }
890 
891 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
892   if (!ST.hasVMEMtoScalarWriteHazard())
893     return false;
894 
895   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
896     return false;
897 
898   if (MI->getNumDefs() == 0)
899     return false;
900 
901   const SIRegisterInfo *TRI = ST.getRegisterInfo();
902 
903   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
904     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
905         !SIInstrInfo::isFLAT(*I))
906       return false;
907 
908     for (const MachineOperand &Def : MI->defs()) {
909       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
910       if (!Op)
911         continue;
912       return true;
913     }
914     return false;
915   };
916 
917   auto IsExpiredFn = [](MachineInstr *MI, int) {
918     return MI && (SIInstrInfo::isVALU(*MI) ||
919                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
920                    !MI->getOperand(0).getImm()) ||
921                   (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
922                    MI->getOperand(0).getImm() == 0xffe3));
923   };
924 
925   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
926       std::numeric_limits<int>::max())
927     return false;
928 
929   const SIInstrInfo *TII = ST.getInstrInfo();
930   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
931           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
932       .addImm(0xffe3);
933   return true;
934 }
935 
936 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
937   if (!ST.hasSMEMtoVectorWriteHazard())
938     return false;
939 
940   if (!SIInstrInfo::isVALU(*MI))
941     return false;
942 
943   unsigned SDSTName;
944   switch (MI->getOpcode()) {
945   case AMDGPU::V_READLANE_B32:
946   case AMDGPU::V_READFIRSTLANE_B32:
947     SDSTName = AMDGPU::OpName::vdst;
948     break;
949   default:
950     SDSTName = AMDGPU::OpName::sdst;
951     break;
952   }
953 
954   const SIInstrInfo *TII = ST.getInstrInfo();
955   const SIRegisterInfo *TRI = ST.getRegisterInfo();
956   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
957   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
958   if (!SDST) {
959     for (const auto &MO : MI->implicit_operands()) {
960       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
961         SDST = &MO;
962         break;
963       }
964     }
965   }
966 
967   if (!SDST)
968     return false;
969 
970   const Register SDSTReg = SDST->getReg();
971   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
972     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
973   };
974 
975   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
976     if (MI) {
977       if (TII->isSALU(*MI)) {
978         switch (MI->getOpcode()) {
979         case AMDGPU::S_SETVSKIP:
980         case AMDGPU::S_VERSION:
981         case AMDGPU::S_WAITCNT_VSCNT:
982         case AMDGPU::S_WAITCNT_VMCNT:
983         case AMDGPU::S_WAITCNT_EXPCNT:
984           // These instructions cannot not mitigate the hazard.
985           return false;
986         case AMDGPU::S_WAITCNT_LGKMCNT:
987           // Reducing lgkmcnt count to 0 always mitigates the hazard.
988           return (MI->getOperand(1).getImm() == 0) &&
989                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
990         case AMDGPU::S_WAITCNT: {
991           const int64_t Imm = MI->getOperand(0).getImm();
992           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
993           return (Decoded.LgkmCnt == 0);
994         }
995         default:
996           // SOPP instructions cannot mitigate the hazard.
997           if (TII->isSOPP(*MI))
998             return false;
999           // At this point the SALU can be assumed to mitigate the hazard
1000           // because either:
1001           // (a) it is independent of the at risk SMEM (breaking chain),
1002           // or
1003           // (b) it is dependent on the SMEM, in which case an appropriate
1004           //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1005           //     SMEM instruction.
1006           return true;
1007         }
1008       }
1009     }
1010     return false;
1011   };
1012 
1013   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1014       std::numeric_limits<int>::max())
1015     return false;
1016 
1017   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1018           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1019       .addImm(0);
1020   return true;
1021 }
1022 
1023 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1024   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1025     return false;
1026 
1027   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1028   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1029     return false;
1030 
1031   auto IsHazardFn = [TRI] (MachineInstr *I) {
1032     if (SIInstrInfo::isVALU(*I))
1033       return false;
1034     return I->readsRegister(AMDGPU::EXEC, TRI);
1035   };
1036 
1037   const SIInstrInfo *TII = ST.getInstrInfo();
1038   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1039     if (!MI)
1040       return false;
1041     if (SIInstrInfo::isVALU(*MI)) {
1042       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1043         return true;
1044       for (auto MO : MI->implicit_operands())
1045         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1046           return true;
1047     }
1048     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1049         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1050       return true;
1051     return false;
1052   };
1053 
1054   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1055       std::numeric_limits<int>::max())
1056     return false;
1057 
1058   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1059           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1060     .addImm(0xfffe);
1061   return true;
1062 }
1063 
1064 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1065   if (!ST.hasLdsBranchVmemWARHazard())
1066     return false;
1067 
1068   auto IsHazardInst = [] (const MachineInstr *MI) {
1069     if (SIInstrInfo::isDS(*MI))
1070       return 1;
1071     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1072       return 2;
1073     return 0;
1074   };
1075 
1076   auto InstType = IsHazardInst(MI);
1077   if (!InstType)
1078     return false;
1079 
1080   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1081     return I && (IsHazardInst(I) ||
1082                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1083                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1084                   !I->getOperand(1).getImm()));
1085   };
1086 
1087   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1088     if (!I->isBranch())
1089       return false;
1090 
1091     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1092       auto InstType2 = IsHazardInst(I);
1093       return InstType2 && InstType != InstType2;
1094     };
1095 
1096     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1097       if (!I)
1098         return false;
1099 
1100       auto InstType2 = IsHazardInst(I);
1101       if (InstType == InstType2)
1102         return true;
1103 
1104       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1105              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1106              !I->getOperand(1).getImm();
1107     };
1108 
1109     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1110            std::numeric_limits<int>::max();
1111   };
1112 
1113   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1114       std::numeric_limits<int>::max())
1115     return false;
1116 
1117   const SIInstrInfo *TII = ST.getInstrInfo();
1118   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1119           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1120     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1121     .addImm(0);
1122 
1123   return true;
1124 }
1125 
1126 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1127   int NSAtoVMEMWaitStates = 1;
1128 
1129   if (!ST.hasNSAtoVMEMBug())
1130     return 0;
1131 
1132   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1133     return 0;
1134 
1135   const SIInstrInfo *TII = ST.getInstrInfo();
1136   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1137   if (!Offset || (Offset->getImm() & 6) == 0)
1138     return 0;
1139 
1140   auto IsHazardFn = [TII] (MachineInstr *I) {
1141     if (!SIInstrInfo::isMIMG(*I))
1142       return false;
1143     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1144     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1145            TII->getInstSizeInBytes(*I) >= 16;
1146   };
1147 
1148   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1149 }
1150 
1151 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1152   int FPAtomicToDenormModeWaitStates = 3;
1153 
1154   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1155     return 0;
1156 
1157   auto IsHazardFn = [] (MachineInstr *I) {
1158     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1159       return false;
1160     return SIInstrInfo::isFPAtomic(*I);
1161   };
1162 
1163   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1164     if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1165       return true;
1166 
1167     switch (MI->getOpcode()) {
1168     case AMDGPU::S_WAITCNT:
1169     case AMDGPU::S_WAITCNT_VSCNT:
1170     case AMDGPU::S_WAITCNT_VMCNT:
1171     case AMDGPU::S_WAITCNT_EXPCNT:
1172     case AMDGPU::S_WAITCNT_LGKMCNT:
1173     case AMDGPU::S_WAIT_IDLE:
1174       return true;
1175     default:
1176       break;
1177     }
1178 
1179     return false;
1180   };
1181 
1182 
1183   return FPAtomicToDenormModeWaitStates -
1184          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1185 }
1186 
1187 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1188   assert(SIInstrInfo::isMAI(*MI));
1189 
1190   int WaitStatesNeeded = 0;
1191   unsigned Opc = MI->getOpcode();
1192 
1193   auto IsVALUFn = [] (MachineInstr *MI) {
1194     return SIInstrInfo::isVALU(*MI);
1195   };
1196 
1197   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1198     const int LegacyVALUWritesVGPRWaitStates = 2;
1199     const int VALUWritesExecWaitStates = 4;
1200     const int MaxWaitStates = 4;
1201 
1202     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1203       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1204     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1205 
1206     if (WaitStatesNeeded < MaxWaitStates) {
1207       for (const MachineOperand &Use : MI->explicit_uses()) {
1208         const int MaxWaitStates = 2;
1209 
1210         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1211           continue;
1212 
1213         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1214           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1215         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1216 
1217         if (WaitStatesNeeded == MaxWaitStates)
1218           break;
1219       }
1220     }
1221   }
1222 
1223   auto IsMFMAFn = [] (MachineInstr *MI) {
1224     return SIInstrInfo::isMAI(*MI) &&
1225            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1226            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
1227   };
1228 
1229   for (const MachineOperand &Op : MI->explicit_operands()) {
1230     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1231       continue;
1232 
1233     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1234       continue;
1235 
1236     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1237     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1238     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1239     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1240     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1241     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1242     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1243     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1244     const int MaxWaitStates = 18;
1245     Register Reg = Op.getReg();
1246     unsigned HazardDefLatency = 0;
1247 
1248     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1249                               (MachineInstr *MI) {
1250       if (!IsMFMAFn(MI))
1251         return false;
1252       Register DstReg = MI->getOperand(0).getReg();
1253       if (DstReg == Reg)
1254         return false;
1255       HazardDefLatency = std::max(HazardDefLatency,
1256                                   TSchedModel.computeInstrLatency(MI));
1257       return TRI.regsOverlap(DstReg, Reg);
1258     };
1259 
1260     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1261                                                    MaxWaitStates);
1262     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1263     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1264     int OpNo = MI->getOperandNo(&Op);
1265     if (OpNo == SrcCIdx) {
1266       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1267     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
1268       switch (HazardDefLatency) {
1269       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1270                break;
1271       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1272                break;
1273       case 16: LLVM_FALLTHROUGH;
1274       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1275                break;
1276       }
1277     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1278       switch (HazardDefLatency) {
1279       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1280                break;
1281       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1282                break;
1283       case 16: LLVM_FALLTHROUGH;
1284       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1285                break;
1286       }
1287     }
1288 
1289     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1290     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1291 
1292     if (WaitStatesNeeded == MaxWaitStates)
1293       return WaitStatesNeeded; // Early exit.
1294 
1295     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1296       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1297         return false;
1298       Register DstReg = MI->getOperand(0).getReg();
1299       return TRI.regsOverlap(Reg, DstReg);
1300     };
1301 
1302     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1303     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1304     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1305     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1306     if (OpNo == SrcCIdx)
1307       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1308     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
1309       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1310 
1311     WaitStatesNeededForUse = NeedWaitStates -
1312       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1313     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1314 
1315     if (WaitStatesNeeded == MaxWaitStates)
1316       return WaitStatesNeeded; // Early exit.
1317   }
1318 
1319   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
1320     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1321     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1322     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1323     const int MaxWaitStates = 13;
1324     Register DstReg = MI->getOperand(0).getReg();
1325     unsigned HazardDefLatency = 0;
1326 
1327     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1328                          (MachineInstr *MI) {
1329       if (!IsMFMAFn(MI))
1330         return false;
1331       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1332       HazardDefLatency = std::max(HazardDefLatency,
1333                                   TSchedModel.computeInstrLatency(MI));
1334       return TRI.regsOverlap(Reg, DstReg);
1335     };
1336 
1337     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1338     int NeedWaitStates;
1339     switch (HazardDefLatency) {
1340     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1341              break;
1342     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1343              break;
1344     case 16: LLVM_FALLTHROUGH;
1345     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1346              break;
1347     }
1348 
1349     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1350     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1351   }
1352 
1353   return WaitStatesNeeded;
1354 }
1355 
1356 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1357   if (!ST.hasMAIInsts())
1358     return 0;
1359 
1360   int WaitStatesNeeded = 0;
1361 
1362   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1363     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
1364   };
1365 
1366   for (const MachineOperand &Op : MI->explicit_uses()) {
1367     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1368       continue;
1369 
1370     Register Reg = Op.getReg();
1371 
1372     const int AccVgprReadLdStWaitStates = 2;
1373     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
1374     const int MaxWaitStates = 2;
1375 
1376     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1377       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1378     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1379 
1380     if (WaitStatesNeeded == MaxWaitStates)
1381       return WaitStatesNeeded; // Early exit.
1382 
1383     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
1384       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
1385           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1386         return false;
1387       auto IsVALUFn = [] (MachineInstr *MI) {
1388         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1389       };
1390       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1391              std::numeric_limits<int>::max();
1392     };
1393 
1394     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
1395       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
1396     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1397   }
1398 
1399   return WaitStatesNeeded;
1400 }
1401 
1402 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
1403   if (!SU->isInstr())
1404     return false;
1405 
1406   MachineInstr *MAI = nullptr;
1407   auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
1408     MAI = nullptr;
1409     if (SIInstrInfo::isMAI(*MI) &&
1410         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
1411         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
1412       MAI = MI;
1413     return MAI != nullptr;
1414   };
1415 
1416   MachineInstr *MI = SU->getInstr();
1417   if (IsMFMAFn(MI)) {
1418     int W = getWaitStatesSince(IsMFMAFn, 16);
1419     if (MAI)
1420       return W < (int)TSchedModel.computeInstrLatency(MAI);
1421   }
1422 
1423   return false;
1424 }
1425