xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision e2eeea75eb8b6dd50c1298067a0655880d186734)
1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
33 
34 using namespace llvm;
35 
36 //===----------------------------------------------------------------------===//
37 // Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
39 
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41   IsHazardRecognizerMode(false),
42   CurrCycleInstr(nullptr),
43   MF(MF),
44   ST(MF.getSubtarget<GCNSubtarget>()),
45   TII(*ST.getInstrInfo()),
46   TRI(TII.getRegisterInfo()),
47   ClauseUses(TRI.getNumRegUnits()),
48   ClauseDefs(TRI.getNumRegUnits()) {
49   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50   TSchedModel.init(&ST);
51 }
52 
53 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54   EmitInstruction(SU->getInstr());
55 }
56 
57 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
58   CurrCycleInstr = MI;
59 }
60 
61 static bool isDivFMas(unsigned Opcode) {
62   return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
63 }
64 
65 static bool isSGetReg(unsigned Opcode) {
66   return Opcode == AMDGPU::S_GETREG_B32;
67 }
68 
69 static bool isSSetReg(unsigned Opcode) {
70   return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
71 }
72 
73 static bool isRWLane(unsigned Opcode) {
74   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
75 }
76 
77 static bool isRFE(unsigned Opcode) {
78   return Opcode == AMDGPU::S_RFE_B64;
79 }
80 
81 static bool isSMovRel(unsigned Opcode) {
82   switch (Opcode) {
83   case AMDGPU::S_MOVRELS_B32:
84   case AMDGPU::S_MOVRELS_B64:
85   case AMDGPU::S_MOVRELD_B32:
86   case AMDGPU::S_MOVRELD_B64:
87     return true;
88   default:
89     return false;
90   }
91 }
92 
93 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
94                                     const MachineInstr &MI) {
95   if (TII.isAlwaysGDS(MI.getOpcode()))
96     return true;
97 
98   switch (MI.getOpcode()) {
99   case AMDGPU::S_SENDMSG:
100   case AMDGPU::S_SENDMSGHALT:
101   case AMDGPU::S_TTRACEDATA:
102     return true;
103   // These DS opcodes don't support GDS.
104   case AMDGPU::DS_NOP:
105   case AMDGPU::DS_PERMUTE_B32:
106   case AMDGPU::DS_BPERMUTE_B32:
107     return false;
108   default:
109     if (TII.isDS(MI.getOpcode())) {
110       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
111                                            AMDGPU::OpName::gds);
112       if (MI.getOperand(GDS).getImm())
113         return true;
114     }
115     return false;
116   }
117 }
118 
119 static bool isPermlane(const MachineInstr &MI) {
120   unsigned Opcode = MI.getOpcode();
121   return Opcode == AMDGPU::V_PERMLANE16_B32 ||
122          Opcode == AMDGPU::V_PERMLANEX16_B32;
123 }
124 
125 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
126   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
127                                                      AMDGPU::OpName::simm16);
128   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
129 }
130 
131 ScheduleHazardRecognizer::HazardType
132 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
133   MachineInstr *MI = SU->getInstr();
134   if (MI->isBundle())
135    return NoHazard;
136 
137   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
138     return NoopHazard;
139 
140   // FIXME: Should flat be considered vmem?
141   if ((SIInstrInfo::isVMEM(*MI) ||
142        SIInstrInfo::isFLAT(*MI))
143       && checkVMEMHazards(MI) > 0)
144     return NoopHazard;
145 
146   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
147     return NoopHazard;
148 
149   if (checkFPAtomicToDenormModeHazard(MI) > 0)
150     return NoopHazard;
151 
152   if (ST.hasNoDataDepHazard())
153     return NoHazard;
154 
155   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
156     return NoopHazard;
157 
158   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
159     return NoopHazard;
160 
161   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
162     return NoopHazard;
163 
164   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
165     return NoopHazard;
166 
167   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
168     return NoopHazard;
169 
170   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
171     return NoopHazard;
172 
173   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
174     return NoopHazard;
175 
176   if (ST.hasReadM0MovRelInterpHazard() &&
177       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
178       checkReadM0Hazards(MI) > 0)
179     return NoopHazard;
180 
181   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
182       checkReadM0Hazards(MI) > 0)
183     return NoopHazard;
184 
185   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
186     return NoopHazard;
187 
188   if (MI->mayLoadOrStore() && checkMAILdStHazards(MI) > 0)
189     return NoopHazard;
190 
191   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
192     return NoopHazard;
193 
194   if (checkAnyInstHazards(MI) > 0)
195     return NoopHazard;
196 
197   return NoHazard;
198 }
199 
200 static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
201   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
202       .addImm(0);
203 }
204 
205 void GCNHazardRecognizer::processBundle() {
206   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
207   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
208   // Check bundled MachineInstr's for hazards.
209   for (; MI != E && MI->isInsideBundle(); ++MI) {
210     CurrCycleInstr = &*MI;
211     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
212 
213     if (IsHazardRecognizerMode)
214       fixHazards(CurrCycleInstr);
215 
216     for (unsigned i = 0; i < WaitStates; ++i)
217       insertNoopInBundle(CurrCycleInstr, TII);
218 
219     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
220     // include the bundled MI directly after, only add a maximum of
221     // (MaxLookAhead - 1) noops to EmittedInstrs.
222     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
223       EmittedInstrs.push_front(nullptr);
224 
225     EmittedInstrs.push_front(CurrCycleInstr);
226     EmittedInstrs.resize(MaxLookAhead);
227   }
228   CurrCycleInstr = nullptr;
229 }
230 
231 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
232   IsHazardRecognizerMode = true;
233   CurrCycleInstr = MI;
234   unsigned W = PreEmitNoopsCommon(MI);
235   fixHazards(MI);
236   CurrCycleInstr = nullptr;
237   return W;
238 }
239 
240 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
241   if (MI->isBundle())
242     return 0;
243 
244   int WaitStates = std::max(0, checkAnyInstHazards(MI));
245 
246   if (SIInstrInfo::isSMRD(*MI))
247     return std::max(WaitStates, checkSMRDHazards(MI));
248 
249   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
250     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
251 
252   if (ST.hasNSAtoVMEMBug())
253     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
254 
255   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
256 
257   if (ST.hasNoDataDepHazard())
258     return WaitStates;
259 
260   if (SIInstrInfo::isVALU(*MI))
261     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
262 
263   if (SIInstrInfo::isDPP(*MI))
264     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
265 
266   if (isDivFMas(MI->getOpcode()))
267     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
268 
269   if (isRWLane(MI->getOpcode()))
270     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
271 
272   if (MI->isInlineAsm())
273     return std::max(WaitStates, checkInlineAsmHazards(MI));
274 
275   if (isSGetReg(MI->getOpcode()))
276     return std::max(WaitStates, checkGetRegHazards(MI));
277 
278   if (isSSetReg(MI->getOpcode()))
279     return std::max(WaitStates, checkSetRegHazards(MI));
280 
281   if (isRFE(MI->getOpcode()))
282     return std::max(WaitStates, checkRFEHazards(MI));
283 
284   if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
285                                            isSMovRel(MI->getOpcode())))
286     return std::max(WaitStates, checkReadM0Hazards(MI));
287 
288   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
289     return std::max(WaitStates, checkReadM0Hazards(MI));
290 
291   if (SIInstrInfo::isMAI(*MI))
292     return std::max(WaitStates, checkMAIHazards(MI));
293 
294   if (MI->mayLoadOrStore())
295     return std::max(WaitStates, checkMAILdStHazards(MI));
296 
297   return WaitStates;
298 }
299 
300 void GCNHazardRecognizer::EmitNoop() {
301   EmittedInstrs.push_front(nullptr);
302 }
303 
304 void GCNHazardRecognizer::AdvanceCycle() {
305   // When the scheduler detects a stall, it will call AdvanceCycle() without
306   // emitting any instructions.
307   if (!CurrCycleInstr)
308     return;
309 
310   // Do not track non-instructions which do not affect the wait states.
311   // If included, these instructions can lead to buffer overflow such that
312   // detectable hazards are missed.
313   if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
314       CurrCycleInstr->isKill())
315     return;
316 
317   if (CurrCycleInstr->isBundle()) {
318     processBundle();
319     return;
320   }
321 
322   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
323 
324   // Keep track of emitted instructions
325   EmittedInstrs.push_front(CurrCycleInstr);
326 
327   // Add a nullptr for each additional wait state after the first.  Make sure
328   // not to add more than getMaxLookAhead() items to the list, since we
329   // truncate the list to that size right after this loop.
330   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
331        i < e; ++i) {
332     EmittedInstrs.push_front(nullptr);
333   }
334 
335   // getMaxLookahead() is the largest number of wait states we will ever need
336   // to insert, so there is no point in keeping track of more than that many
337   // wait states.
338   EmittedInstrs.resize(getMaxLookAhead());
339 
340   CurrCycleInstr = nullptr;
341 }
342 
343 void GCNHazardRecognizer::RecedeCycle() {
344   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
345 }
346 
347 //===----------------------------------------------------------------------===//
348 // Helper Functions
349 //===----------------------------------------------------------------------===//
350 
351 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
352 
353 // Returns a minimum wait states since \p I walking all predecessors.
354 // Only scans until \p IsExpired does not return true.
355 // Can only be run in a hazard recognizer mode.
356 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
357                               MachineBasicBlock *MBB,
358                               MachineBasicBlock::reverse_instr_iterator I,
359                               int WaitStates,
360                               IsExpiredFn IsExpired,
361                               DenseSet<const MachineBasicBlock *> &Visited) {
362   for (auto E = MBB->instr_rend(); I != E; ++I) {
363     // Don't add WaitStates for parent BUNDLE instructions.
364     if (I->isBundle())
365       continue;
366 
367     if (IsHazard(&*I))
368       return WaitStates;
369 
370     if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
371       continue;
372 
373     WaitStates += SIInstrInfo::getNumWaitStates(*I);
374 
375     if (IsExpired(&*I, WaitStates))
376       return std::numeric_limits<int>::max();
377   }
378 
379   int MinWaitStates = WaitStates;
380   bool Found = false;
381   for (MachineBasicBlock *Pred : MBB->predecessors()) {
382     if (!Visited.insert(Pred).second)
383       continue;
384 
385     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
386                                WaitStates, IsExpired, Visited);
387 
388     if (W == std::numeric_limits<int>::max())
389       continue;
390 
391     MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
392     if (IsExpired(nullptr, MinWaitStates))
393       return MinWaitStates;
394 
395     Found = true;
396   }
397 
398   if (Found)
399     return MinWaitStates;
400 
401   return std::numeric_limits<int>::max();
402 }
403 
404 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
405                               MachineInstr *MI,
406                               IsExpiredFn IsExpired) {
407   DenseSet<const MachineBasicBlock *> Visited;
408   return getWaitStatesSince(IsHazard, MI->getParent(),
409                             std::next(MI->getReverseIterator()),
410                             0, IsExpired, Visited);
411 }
412 
413 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
414   if (IsHazardRecognizerMode) {
415     auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
416       return WaitStates >= Limit;
417     };
418     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
419   }
420 
421   int WaitStates = 0;
422   for (MachineInstr *MI : EmittedInstrs) {
423     if (MI) {
424       if (IsHazard(MI))
425         return WaitStates;
426 
427       if (MI->isInlineAsm())
428         continue;
429     }
430     ++WaitStates;
431 
432     if (WaitStates >= Limit)
433       break;
434   }
435   return std::numeric_limits<int>::max();
436 }
437 
438 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
439                                                IsHazardFn IsHazardDef,
440                                                int Limit) {
441   const SIRegisterInfo *TRI = ST.getRegisterInfo();
442 
443   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
444     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
445   };
446 
447   return getWaitStatesSince(IsHazardFn, Limit);
448 }
449 
450 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
451                                                   int Limit) {
452   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
453     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
454   };
455 
456   return getWaitStatesSince(IsHazardFn, Limit);
457 }
458 
459 //===----------------------------------------------------------------------===//
460 // No-op Hazard Detection
461 //===----------------------------------------------------------------------===//
462 
463 static void addRegUnits(const SIRegisterInfo &TRI,
464                         BitVector &BV, unsigned Reg) {
465   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
466     BV.set(*RUI);
467 }
468 
469 static void addRegsToSet(const SIRegisterInfo &TRI,
470                          iterator_range<MachineInstr::const_mop_iterator> Ops,
471                          BitVector &Set) {
472   for (const MachineOperand &Op : Ops) {
473     if (Op.isReg())
474       addRegUnits(TRI, Set, Op.getReg());
475   }
476 }
477 
478 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
479   // XXX: Do we need to worry about implicit operands
480   addRegsToSet(TRI, MI.defs(), ClauseDefs);
481   addRegsToSet(TRI, MI.uses(), ClauseUses);
482 }
483 
484 static bool breaksSMEMSoftClause(MachineInstr *MI) {
485   return !SIInstrInfo::isSMRD(*MI);
486 }
487 
488 static bool breaksVMEMSoftClause(MachineInstr *MI) {
489   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
490 }
491 
492 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
493   // SMEM soft clause are only present on VI+, and only matter if xnack is
494   // enabled.
495   if (!ST.isXNACKEnabled())
496     return 0;
497 
498   bool IsSMRD = TII.isSMRD(*MEM);
499 
500   resetClause();
501 
502   // A soft-clause is any group of consecutive SMEM instructions.  The
503   // instructions in this group may return out of order and/or may be
504   // replayed (i.e. the same instruction issued more than once).
505   //
506   // In order to handle these situations correctly we need to make sure that
507   // when a clause has more than one instruction, no instruction in the clause
508   // writes to a register that is read by another instruction in the clause
509   // (including itself). If we encounter this situaion, we need to break the
510   // clause by inserting a non SMEM instruction.
511 
512   for (MachineInstr *MI : EmittedInstrs) {
513     // When we hit a non-SMEM instruction then we have passed the start of the
514     // clause and we can stop.
515     if (!MI)
516       break;
517 
518     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
519       break;
520 
521     addClauseInst(*MI);
522   }
523 
524   if (ClauseDefs.none())
525     return 0;
526 
527   // We need to make sure not to put loads and stores in the same clause if they
528   // use the same address. For now, just start a new clause whenever we see a
529   // store.
530   if (MEM->mayStore())
531     return 1;
532 
533   addClauseInst(*MEM);
534 
535   // If the set of defs and uses intersect then we cannot add this instruction
536   // to the clause, so we have a hazard.
537   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
538 }
539 
540 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
541   int WaitStatesNeeded = 0;
542 
543   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
544 
545   // This SMRD hazard only affects SI.
546   if (!ST.hasSMRDReadVALUDefHazard())
547     return WaitStatesNeeded;
548 
549   // A read of an SGPR by SMRD instruction requires 4 wait states when the
550   // SGPR was written by a VALU instruction.
551   int SmrdSgprWaitStates = 4;
552   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
553   auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
554 
555   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
556 
557   for (const MachineOperand &Use : SMRD->uses()) {
558     if (!Use.isReg())
559       continue;
560     int WaitStatesNeededForUse =
561         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
562                                                    SmrdSgprWaitStates);
563     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
564 
565     // This fixes what appears to be undocumented hardware behavior in SI where
566     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
567     // needs some number of nops in between. We don't know how many we need, but
568     // let's use 4. This wasn't discovered before probably because the only
569     // case when this happens is when we expand a 64-bit pointer into a full
570     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
571     // probably never encountered in the closed-source land.
572     if (IsBufferSMRD) {
573       int WaitStatesNeededForUse =
574         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
575                                                    IsBufferHazardDefFn,
576                                                    SmrdSgprWaitStates);
577       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
578     }
579   }
580 
581   return WaitStatesNeeded;
582 }
583 
584 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
585   if (!ST.hasVMEMReadSGPRVALUDefHazard())
586     return 0;
587 
588   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
589 
590   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
591   // SGPR was written by a VALU Instruction.
592   const int VmemSgprWaitStates = 5;
593   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
594   for (const MachineOperand &Use : VMEM->uses()) {
595     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
596       continue;
597 
598     int WaitStatesNeededForUse =
599         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
600                                                    VmemSgprWaitStates);
601     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
602   }
603   return WaitStatesNeeded;
604 }
605 
606 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
607   const SIRegisterInfo *TRI = ST.getRegisterInfo();
608   const SIInstrInfo *TII = ST.getInstrInfo();
609 
610   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
611   int DppVgprWaitStates = 2;
612   int DppExecWaitStates = 5;
613   int WaitStatesNeeded = 0;
614   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
615 
616   for (const MachineOperand &Use : DPP->uses()) {
617     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
618       continue;
619     int WaitStatesNeededForUse =
620         DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
621                               [](MachineInstr *) { return true; },
622                               DppVgprWaitStates);
623     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
624   }
625 
626   WaitStatesNeeded = std::max(
627       WaitStatesNeeded,
628       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
629                                                 DppExecWaitStates));
630 
631   return WaitStatesNeeded;
632 }
633 
634 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
635   const SIInstrInfo *TII = ST.getInstrInfo();
636 
637   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
638   // instruction.
639   const int DivFMasWaitStates = 4;
640   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
641   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
642                                                DivFMasWaitStates);
643 
644   return DivFMasWaitStates - WaitStatesNeeded;
645 }
646 
647 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
648   const SIInstrInfo *TII = ST.getInstrInfo();
649   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
650 
651   const int GetRegWaitStates = 2;
652   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
653     return GetRegHWReg == getHWReg(TII, *MI);
654   };
655   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
656 
657   return GetRegWaitStates - WaitStatesNeeded;
658 }
659 
660 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
661   const SIInstrInfo *TII = ST.getInstrInfo();
662   unsigned HWReg = getHWReg(TII, *SetRegInstr);
663 
664   const int SetRegWaitStates = ST.getSetRegWaitStates();
665   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
666     return HWReg == getHWReg(TII, *MI);
667   };
668   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
669   return SetRegWaitStates - WaitStatesNeeded;
670 }
671 
672 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
673   if (!MI.mayStore())
674     return -1;
675 
676   const SIInstrInfo *TII = ST.getInstrInfo();
677   unsigned Opcode = MI.getOpcode();
678   const MCInstrDesc &Desc = MI.getDesc();
679 
680   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
681   int VDataRCID = -1;
682   if (VDataIdx != -1)
683     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
684 
685   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
686     // There is no hazard if the instruction does not use vector regs
687     // (like wbinvl1)
688     if (VDataIdx == -1)
689       return -1;
690     // For MUBUF/MTBUF instructions this hazard only exists if the
691     // instruction is not using a register in the soffset field.
692     const MachineOperand *SOffset =
693         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
694     // If we have no soffset operand, then assume this field has been
695     // hardcoded to zero.
696     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
697         (!SOffset || !SOffset->isReg()))
698       return VDataIdx;
699   }
700 
701   // MIMG instructions create a hazard if they don't use a 256-bit T# and
702   // the store size is greater than 8 bytes and they have more than two bits
703   // of their dmask set.
704   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
705   if (TII->isMIMG(MI)) {
706     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
707     assert(SRsrcIdx != -1 &&
708            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
709     (void)SRsrcIdx;
710   }
711 
712   if (TII->isFLAT(MI)) {
713     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
714     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
715       return DataIdx;
716   }
717 
718   return -1;
719 }
720 
721 int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
722 						const MachineRegisterInfo &MRI) {
723   // Helper to check for the hazard where VMEM instructions that store more than
724   // 8 bytes can have there store data over written by the next instruction.
725   const SIRegisterInfo *TRI = ST.getRegisterInfo();
726 
727   const int VALUWaitStates = 1;
728   int WaitStatesNeeded = 0;
729 
730   if (!TRI->isVGPR(MRI, Def.getReg()))
731     return WaitStatesNeeded;
732   Register Reg = Def.getReg();
733   auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
734     int DataIdx = createsVALUHazard(*MI);
735     return DataIdx >= 0 &&
736     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
737   };
738   int WaitStatesNeededForDef =
739     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
740   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
741 
742   return WaitStatesNeeded;
743 }
744 
745 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
746   // This checks for the hazard where VMEM instructions that store more than
747   // 8 bytes can have there store data over written by the next instruction.
748   if (!ST.has12DWordStoreHazard())
749     return 0;
750 
751   const MachineRegisterInfo &MRI = MF.getRegInfo();
752   int WaitStatesNeeded = 0;
753 
754   for (const MachineOperand &Def : VALU->defs()) {
755     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
756   }
757 
758   return WaitStatesNeeded;
759 }
760 
761 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
762   // This checks for hazards associated with inline asm statements.
763   // Since inline asms can contain just about anything, we use this
764   // to call/leverage other check*Hazard routines. Note that
765   // this function doesn't attempt to address all possible inline asm
766   // hazards (good luck), but is a collection of what has been
767   // problematic thus far.
768 
769   // see checkVALUHazards()
770   if (!ST.has12DWordStoreHazard())
771     return 0;
772 
773   const MachineRegisterInfo &MRI = MF.getRegInfo();
774   int WaitStatesNeeded = 0;
775 
776   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
777        I != E; ++I) {
778     const MachineOperand &Op = IA->getOperand(I);
779     if (Op.isReg() && Op.isDef()) {
780       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
781     }
782   }
783 
784   return WaitStatesNeeded;
785 }
786 
787 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
788   const SIInstrInfo *TII = ST.getInstrInfo();
789   const SIRegisterInfo *TRI = ST.getRegisterInfo();
790   const MachineRegisterInfo &MRI = MF.getRegInfo();
791 
792   const MachineOperand *LaneSelectOp =
793       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
794 
795   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
796     return 0;
797 
798   Register LaneSelectReg = LaneSelectOp->getReg();
799   auto IsHazardFn = [TII] (MachineInstr *MI) {
800     return TII->isVALU(*MI);
801   };
802 
803   const int RWLaneWaitStates = 4;
804   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
805                                               RWLaneWaitStates);
806   return RWLaneWaitStates - WaitStatesSince;
807 }
808 
809 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
810   if (!ST.hasRFEHazards())
811     return 0;
812 
813   const SIInstrInfo *TII = ST.getInstrInfo();
814 
815   const int RFEWaitStates = 1;
816 
817   auto IsHazardFn = [TII] (MachineInstr *MI) {
818     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
819   };
820   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
821   return RFEWaitStates - WaitStatesNeeded;
822 }
823 
824 int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
825   if (MI->isDebugInstr())
826     return 0;
827 
828   const SIRegisterInfo *TRI = ST.getRegisterInfo();
829   if (!ST.hasSMovFedHazard())
830     return 0;
831 
832   // Check for any instruction reading an SGPR after a write from
833   // s_mov_fed_b32.
834   int MovFedWaitStates = 1;
835   int WaitStatesNeeded = 0;
836 
837   for (const MachineOperand &Use : MI->uses()) {
838     if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
839       continue;
840     auto IsHazardFn = [] (MachineInstr *MI) {
841       return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
842     };
843     int WaitStatesNeededForUse =
844         MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
845                                                  MovFedWaitStates);
846     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
847   }
848 
849   return WaitStatesNeeded;
850 }
851 
852 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
853   const SIInstrInfo *TII = ST.getInstrInfo();
854   const int SMovRelWaitStates = 1;
855   auto IsHazardFn = [TII] (MachineInstr *MI) {
856     return TII->isSALU(*MI);
857   };
858   return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
859                                                    SMovRelWaitStates);
860 }
861 
862 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
863   fixVMEMtoScalarWriteHazards(MI);
864   fixVcmpxPermlaneHazards(MI);
865   fixSMEMtoVectorWriteHazards(MI);
866   fixVcmpxExecWARHazard(MI);
867   fixLdsBranchVmemWARHazard(MI);
868 }
869 
870 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
871   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
872     return false;
873 
874   const SIInstrInfo *TII = ST.getInstrInfo();
875   auto IsHazardFn = [TII] (MachineInstr *MI) {
876     return TII->isVOPC(*MI);
877   };
878 
879   auto IsExpiredFn = [] (MachineInstr *MI, int) {
880     if (!MI)
881       return false;
882     unsigned Opc = MI->getOpcode();
883     return SIInstrInfo::isVALU(*MI) &&
884            Opc != AMDGPU::V_NOP_e32 &&
885            Opc != AMDGPU::V_NOP_e64 &&
886            Opc != AMDGPU::V_NOP_sdwa;
887   };
888 
889   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
890       std::numeric_limits<int>::max())
891     return false;
892 
893   // V_NOP will be discarded by SQ.
894   // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
895   // which is always a VGPR and available.
896   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
897   Register Reg = Src0->getReg();
898   bool IsUndef = Src0->isUndef();
899   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
900           TII->get(AMDGPU::V_MOV_B32_e32))
901     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
902     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
903 
904   return true;
905 }
906 
907 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
908   if (!ST.hasVMEMtoScalarWriteHazard())
909     return false;
910 
911   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
912     return false;
913 
914   if (MI->getNumDefs() == 0)
915     return false;
916 
917   const SIRegisterInfo *TRI = ST.getRegisterInfo();
918 
919   auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
920     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
921         !SIInstrInfo::isFLAT(*I))
922       return false;
923 
924     for (const MachineOperand &Def : MI->defs()) {
925       MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
926       if (!Op)
927         continue;
928       return true;
929     }
930     return false;
931   };
932 
933   auto IsExpiredFn = [] (MachineInstr *MI, int) {
934     return MI && (SIInstrInfo::isVALU(*MI) ||
935                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
936                    !MI->getOperand(0).getImm()));
937   };
938 
939   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
940       std::numeric_limits<int>::max())
941     return false;
942 
943   const SIInstrInfo *TII = ST.getInstrInfo();
944   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
945   return true;
946 }
947 
948 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
949   if (!ST.hasSMEMtoVectorWriteHazard())
950     return false;
951 
952   if (!SIInstrInfo::isVALU(*MI))
953     return false;
954 
955   unsigned SDSTName;
956   switch (MI->getOpcode()) {
957   case AMDGPU::V_READLANE_B32:
958   case AMDGPU::V_READLANE_B32_gfx10:
959   case AMDGPU::V_READFIRSTLANE_B32:
960     SDSTName = AMDGPU::OpName::vdst;
961     break;
962   default:
963     SDSTName = AMDGPU::OpName::sdst;
964     break;
965   }
966 
967   const SIInstrInfo *TII = ST.getInstrInfo();
968   const SIRegisterInfo *TRI = ST.getRegisterInfo();
969   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
970   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
971   if (!SDST) {
972     for (const auto &MO : MI->implicit_operands()) {
973       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
974         SDST = &MO;
975         break;
976       }
977     }
978   }
979 
980   if (!SDST)
981     return false;
982 
983   const Register SDSTReg = SDST->getReg();
984   auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
985     return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
986   };
987 
988   auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
989     if (MI) {
990       if (TII->isSALU(*MI)) {
991         switch (MI->getOpcode()) {
992         case AMDGPU::S_SETVSKIP:
993         case AMDGPU::S_VERSION:
994         case AMDGPU::S_WAITCNT_VSCNT:
995         case AMDGPU::S_WAITCNT_VMCNT:
996         case AMDGPU::S_WAITCNT_EXPCNT:
997           // These instructions cannot not mitigate the hazard.
998           return false;
999         case AMDGPU::S_WAITCNT_LGKMCNT:
1000           // Reducing lgkmcnt count to 0 always mitigates the hazard.
1001           return (MI->getOperand(1).getImm() == 0) &&
1002                  (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1003         case AMDGPU::S_WAITCNT: {
1004           const int64_t Imm = MI->getOperand(0).getImm();
1005           AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1006           return (Decoded.LgkmCnt == 0);
1007         }
1008         default:
1009           // SOPP instructions cannot mitigate the hazard.
1010           if (TII->isSOPP(*MI))
1011             return false;
1012           // At this point the SALU can be assumed to mitigate the hazard
1013           // because either:
1014           // (a) it is independent of the at risk SMEM (breaking chain),
1015           // or
1016           // (b) it is dependent on the SMEM, in which case an appropriate
1017           //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1018           //     SMEM instruction.
1019           return true;
1020         }
1021       }
1022     }
1023     return false;
1024   };
1025 
1026   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1027       std::numeric_limits<int>::max())
1028     return false;
1029 
1030   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1031           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1032       .addImm(0);
1033   return true;
1034 }
1035 
1036 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1037   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1038     return false;
1039 
1040   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1041   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1042     return false;
1043 
1044   auto IsHazardFn = [TRI] (MachineInstr *I) {
1045     if (SIInstrInfo::isVALU(*I))
1046       return false;
1047     return I->readsRegister(AMDGPU::EXEC, TRI);
1048   };
1049 
1050   const SIInstrInfo *TII = ST.getInstrInfo();
1051   auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1052     if (!MI)
1053       return false;
1054     if (SIInstrInfo::isVALU(*MI)) {
1055       if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1056         return true;
1057       for (auto MO : MI->implicit_operands())
1058         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1059           return true;
1060     }
1061     if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1062         (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1063       return true;
1064     return false;
1065   };
1066 
1067   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1068       std::numeric_limits<int>::max())
1069     return false;
1070 
1071   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1072           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1073     .addImm(0xfffe);
1074   return true;
1075 }
1076 
1077 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1078   if (!ST.hasLdsBranchVmemWARHazard())
1079     return false;
1080 
1081   auto IsHazardInst = [] (const MachineInstr *MI) {
1082     if (SIInstrInfo::isDS(*MI))
1083       return 1;
1084     if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1085       return 2;
1086     return 0;
1087   };
1088 
1089   auto InstType = IsHazardInst(MI);
1090   if (!InstType)
1091     return false;
1092 
1093   auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1094     return I && (IsHazardInst(I) ||
1095                  (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1096                   I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1097                   !I->getOperand(1).getImm()));
1098   };
1099 
1100   auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1101     if (!I->isBranch())
1102       return false;
1103 
1104     auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1105       auto InstType2 = IsHazardInst(I);
1106       return InstType2 && InstType != InstType2;
1107     };
1108 
1109     auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1110       if (!I)
1111         return false;
1112 
1113       auto InstType2 = IsHazardInst(I);
1114       if (InstType == InstType2)
1115         return true;
1116 
1117       return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1118              I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1119              !I->getOperand(1).getImm();
1120     };
1121 
1122     return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1123            std::numeric_limits<int>::max();
1124   };
1125 
1126   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1127       std::numeric_limits<int>::max())
1128     return false;
1129 
1130   const SIInstrInfo *TII = ST.getInstrInfo();
1131   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1132           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1133     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1134     .addImm(0);
1135 
1136   return true;
1137 }
1138 
1139 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1140   int NSAtoVMEMWaitStates = 1;
1141 
1142   if (!ST.hasNSAtoVMEMBug())
1143     return 0;
1144 
1145   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1146     return 0;
1147 
1148   const SIInstrInfo *TII = ST.getInstrInfo();
1149   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1150   if (!Offset || (Offset->getImm() & 6) == 0)
1151     return 0;
1152 
1153   auto IsHazardFn = [TII] (MachineInstr *I) {
1154     if (!SIInstrInfo::isMIMG(*I))
1155       return false;
1156     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1157     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1158            TII->getInstSizeInBytes(*I) >= 16;
1159   };
1160 
1161   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1162 }
1163 
1164 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1165   int FPAtomicToDenormModeWaitStates = 3;
1166 
1167   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1168     return 0;
1169 
1170   auto IsHazardFn = [] (MachineInstr *I) {
1171     if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1172       return false;
1173     return SIInstrInfo::isFPAtomic(*I);
1174   };
1175 
1176   auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1177     if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1178       return true;
1179 
1180     switch (MI->getOpcode()) {
1181     case AMDGPU::S_WAITCNT:
1182     case AMDGPU::S_WAITCNT_VSCNT:
1183     case AMDGPU::S_WAITCNT_VMCNT:
1184     case AMDGPU::S_WAITCNT_EXPCNT:
1185     case AMDGPU::S_WAITCNT_LGKMCNT:
1186     case AMDGPU::S_WAITCNT_IDLE:
1187       return true;
1188     default:
1189       break;
1190     }
1191 
1192     return false;
1193   };
1194 
1195 
1196   return FPAtomicToDenormModeWaitStates -
1197          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1198 }
1199 
1200 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1201   assert(SIInstrInfo::isMAI(*MI));
1202 
1203   int WaitStatesNeeded = 0;
1204   unsigned Opc = MI->getOpcode();
1205 
1206   auto IsVALUFn = [] (MachineInstr *MI) {
1207     return SIInstrInfo::isVALU(*MI);
1208   };
1209 
1210   if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1211     const int LegacyVALUWritesVGPRWaitStates = 2;
1212     const int VALUWritesExecWaitStates = 4;
1213     const int MaxWaitStates = 4;
1214 
1215     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1216       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1217     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1218 
1219     if (WaitStatesNeeded < MaxWaitStates) {
1220       for (const MachineOperand &Use : MI->explicit_uses()) {
1221         const int MaxWaitStates = 2;
1222 
1223         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1224           continue;
1225 
1226         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1227           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1228         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1229 
1230         if (WaitStatesNeeded == MaxWaitStates)
1231           break;
1232       }
1233     }
1234   }
1235 
1236   auto IsMFMAFn = [] (MachineInstr *MI) {
1237     return SIInstrInfo::isMAI(*MI) &&
1238            MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1239            MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1240   };
1241 
1242   for (const MachineOperand &Op : MI->explicit_operands()) {
1243     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1244       continue;
1245 
1246     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1247       continue;
1248 
1249     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1250     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1251     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1252     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1253     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1254     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1255     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1256     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1257     const int MaxWaitStates = 18;
1258     Register Reg = Op.getReg();
1259     unsigned HazardDefLatency = 0;
1260 
1261     auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1262                               (MachineInstr *MI) {
1263       if (!IsMFMAFn(MI))
1264         return false;
1265       Register DstReg = MI->getOperand(0).getReg();
1266       if (DstReg == Reg)
1267         return false;
1268       HazardDefLatency = std::max(HazardDefLatency,
1269                                   TSchedModel.computeInstrLatency(MI));
1270       return TRI.regsOverlap(DstReg, Reg);
1271     };
1272 
1273     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1274                                                    MaxWaitStates);
1275     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1276     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1277     int OpNo = MI->getOperandNo(&Op);
1278     if (OpNo == SrcCIdx) {
1279       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1280     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1281       switch (HazardDefLatency) {
1282       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1283                break;
1284       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1285                break;
1286       case 16: LLVM_FALLTHROUGH;
1287       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1288                break;
1289       }
1290     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1291       switch (HazardDefLatency) {
1292       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1293                break;
1294       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1295                break;
1296       case 16: LLVM_FALLTHROUGH;
1297       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1298                break;
1299       }
1300     }
1301 
1302     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1303     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1304 
1305     if (WaitStatesNeeded == MaxWaitStates)
1306       return WaitStatesNeeded; // Early exit.
1307 
1308     auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1309       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1310         return false;
1311       Register DstReg = MI->getOperand(0).getReg();
1312       return TRI.regsOverlap(Reg, DstReg);
1313     };
1314 
1315     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1316     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1317     const int AccVGPRWriteAccVgprReadWaitStates = 3;
1318     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1319     if (OpNo == SrcCIdx)
1320       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1321     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1322       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1323 
1324     WaitStatesNeededForUse = NeedWaitStates -
1325       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1326     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1327 
1328     if (WaitStatesNeeded == MaxWaitStates)
1329       return WaitStatesNeeded; // Early exit.
1330   }
1331 
1332   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1333     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1334     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1335     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1336     const int MaxWaitStates = 13;
1337     Register DstReg = MI->getOperand(0).getReg();
1338     unsigned HazardDefLatency = 0;
1339 
1340     auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1341                          (MachineInstr *MI) {
1342       if (!IsMFMAFn(MI))
1343         return false;
1344       Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1345       HazardDefLatency = std::max(HazardDefLatency,
1346                                   TSchedModel.computeInstrLatency(MI));
1347       return TRI.regsOverlap(Reg, DstReg);
1348     };
1349 
1350     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1351     int NeedWaitStates;
1352     switch (HazardDefLatency) {
1353     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1354              break;
1355     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1356              break;
1357     case 16: LLVM_FALLTHROUGH;
1358     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1359              break;
1360     }
1361 
1362     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1363     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1364   }
1365 
1366   return WaitStatesNeeded;
1367 }
1368 
1369 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1370   if (!ST.hasMAIInsts())
1371     return 0;
1372 
1373   int WaitStatesNeeded = 0;
1374 
1375   auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1376     return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1377   };
1378 
1379   for (const MachineOperand &Op : MI->explicit_uses()) {
1380     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1381       continue;
1382 
1383     Register Reg = Op.getReg();
1384 
1385     const int AccVgprReadLdStWaitStates = 2;
1386     const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
1387     const int MaxWaitStates = 2;
1388 
1389     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1390       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1391     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1392 
1393     if (WaitStatesNeeded == MaxWaitStates)
1394       return WaitStatesNeeded; // Early exit.
1395 
1396     auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
1397       if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1398         return false;
1399       auto IsVALUFn = [] (MachineInstr *MI) {
1400         return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1401       };
1402       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1403              std::numeric_limits<int>::max();
1404     };
1405 
1406     WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
1407       getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
1408     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1409   }
1410 
1411   return WaitStatesNeeded;
1412 }
1413