xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab)
10b57cec5SDimitry Andric //===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This file implements hazard recognizers for scheduling on GCN processors.
100b57cec5SDimitry Andric //
110b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric 
130b57cec5SDimitry Andric #include "GCNHazardRecognizer.h"
14e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
150b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16*81ad6265SDimitry Andric #include "SIMachineFunctionInfo.h"
170b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
180b57cec5SDimitry Andric #include "llvm/CodeGen/ScheduleDAG.h"
19e8d8bef9SDimitry Andric #include "llvm/Support/TargetParser.h"
200b57cec5SDimitry Andric 
210b57cec5SDimitry Andric using namespace llvm;
220b57cec5SDimitry Andric 
23*81ad6265SDimitry Andric namespace {
24*81ad6265SDimitry Andric 
25*81ad6265SDimitry Andric struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26*81ad6265SDimitry Andric   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27*81ad6265SDimitry Andric 
28*81ad6265SDimitry Andric   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29*81ad6265SDimitry Andric     if (Arg.getAsInteger(0, Value))
30*81ad6265SDimitry Andric       return O.error("'" + Arg + "' value invalid for uint argument!");
31*81ad6265SDimitry Andric 
32*81ad6265SDimitry Andric     if (Value > 100)
33*81ad6265SDimitry Andric       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
34*81ad6265SDimitry Andric 
35*81ad6265SDimitry Andric     return false;
36*81ad6265SDimitry Andric   }
37*81ad6265SDimitry Andric };
38*81ad6265SDimitry Andric 
39*81ad6265SDimitry Andric } // end anonymous namespace
40*81ad6265SDimitry Andric 
41*81ad6265SDimitry Andric static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42*81ad6265SDimitry Andric     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43*81ad6265SDimitry Andric                      cl::desc("Fill a percentage of the latency between "
44*81ad6265SDimitry Andric                               "neighboring MFMA with s_nops."));
45*81ad6265SDimitry Andric 
460b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
47*81ad6265SDimitry Andric // Hazard Recognizer Implementation
480b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
490b57cec5SDimitry Andric 
50fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51fe6060f1SDimitry Andric                                                  const GCNSubtarget &ST);
52fe6060f1SDimitry Andric 
530b57cec5SDimitry Andric GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
540b57cec5SDimitry Andric   IsHazardRecognizerMode(false),
550b57cec5SDimitry Andric   CurrCycleInstr(nullptr),
560b57cec5SDimitry Andric   MF(MF),
570b57cec5SDimitry Andric   ST(MF.getSubtarget<GCNSubtarget>()),
580b57cec5SDimitry Andric   TII(*ST.getInstrInfo()),
590b57cec5SDimitry Andric   TRI(TII.getRegisterInfo()),
600b57cec5SDimitry Andric   ClauseUses(TRI.getNumRegUnits()),
610b57cec5SDimitry Andric   ClauseDefs(TRI.getNumRegUnits()) {
62fe6060f1SDimitry Andric   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
630b57cec5SDimitry Andric   TSchedModel.init(&ST);
64fe6060f1SDimitry Andric   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
650b57cec5SDimitry Andric }
660b57cec5SDimitry Andric 
67e8d8bef9SDimitry Andric void GCNHazardRecognizer::Reset() {
68e8d8bef9SDimitry Andric   EmittedInstrs.clear();
69e8d8bef9SDimitry Andric }
70e8d8bef9SDimitry Andric 
710b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
720b57cec5SDimitry Andric   EmitInstruction(SU->getInstr());
730b57cec5SDimitry Andric }
740b57cec5SDimitry Andric 
750b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
760b57cec5SDimitry Andric   CurrCycleInstr = MI;
770b57cec5SDimitry Andric }
780b57cec5SDimitry Andric 
790b57cec5SDimitry Andric static bool isDivFMas(unsigned Opcode) {
80e8d8bef9SDimitry Andric   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
810b57cec5SDimitry Andric }
820b57cec5SDimitry Andric 
830b57cec5SDimitry Andric static bool isSGetReg(unsigned Opcode) {
840b57cec5SDimitry Andric   return Opcode == AMDGPU::S_GETREG_B32;
850b57cec5SDimitry Andric }
860b57cec5SDimitry Andric 
870b57cec5SDimitry Andric static bool isSSetReg(unsigned Opcode) {
88e8d8bef9SDimitry Andric   switch (Opcode) {
89e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_B32:
90e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_B32_mode:
91e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_IMM32_B32:
92e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_IMM32_B32_mode:
93e8d8bef9SDimitry Andric     return true;
94e8d8bef9SDimitry Andric   }
95e8d8bef9SDimitry Andric   return false;
960b57cec5SDimitry Andric }
970b57cec5SDimitry Andric 
980b57cec5SDimitry Andric static bool isRWLane(unsigned Opcode) {
990b57cec5SDimitry Andric   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
1000b57cec5SDimitry Andric }
1010b57cec5SDimitry Andric 
1020b57cec5SDimitry Andric static bool isRFE(unsigned Opcode) {
1030b57cec5SDimitry Andric   return Opcode == AMDGPU::S_RFE_B64;
1040b57cec5SDimitry Andric }
1050b57cec5SDimitry Andric 
1060b57cec5SDimitry Andric static bool isSMovRel(unsigned Opcode) {
1070b57cec5SDimitry Andric   switch (Opcode) {
1080b57cec5SDimitry Andric   case AMDGPU::S_MOVRELS_B32:
1090b57cec5SDimitry Andric   case AMDGPU::S_MOVRELS_B64:
1100b57cec5SDimitry Andric   case AMDGPU::S_MOVRELD_B32:
1110b57cec5SDimitry Andric   case AMDGPU::S_MOVRELD_B64:
1120b57cec5SDimitry Andric     return true;
1130b57cec5SDimitry Andric   default:
1140b57cec5SDimitry Andric     return false;
1150b57cec5SDimitry Andric   }
1160b57cec5SDimitry Andric }
1170b57cec5SDimitry Andric 
118fe6060f1SDimitry Andric static bool isDGEMM(unsigned Opcode) {
119*81ad6265SDimitry Andric   return AMDGPU::getMAIIsDGEMM(Opcode);
120fe6060f1SDimitry Andric }
121fe6060f1SDimitry Andric 
122fe6060f1SDimitry Andric static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123fe6060f1SDimitry Andric   unsigned Opcode = MI.getOpcode();
124fe6060f1SDimitry Andric 
125fe6060f1SDimitry Andric   if (!SIInstrInfo::isMAI(MI) ||
126fe6060f1SDimitry Andric       isDGEMM(Opcode) ||
127fe6060f1SDimitry Andric       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128fe6060f1SDimitry Andric       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129fe6060f1SDimitry Andric     return false;
130fe6060f1SDimitry Andric 
131*81ad6265SDimitry Andric   if (!ST.hasGFX940Insts())
132fe6060f1SDimitry Andric     return true;
133*81ad6265SDimitry Andric 
134*81ad6265SDimitry Andric   return AMDGPU::getMAIIsGFX940XDL(Opcode);
135fe6060f1SDimitry Andric }
136fe6060f1SDimitry Andric 
1370b57cec5SDimitry Andric static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
1380b57cec5SDimitry Andric                                     const MachineInstr &MI) {
1390b57cec5SDimitry Andric   if (TII.isAlwaysGDS(MI.getOpcode()))
1400b57cec5SDimitry Andric     return true;
1410b57cec5SDimitry Andric 
1420b57cec5SDimitry Andric   switch (MI.getOpcode()) {
1430b57cec5SDimitry Andric   case AMDGPU::S_SENDMSG:
1440b57cec5SDimitry Andric   case AMDGPU::S_SENDMSGHALT:
1450b57cec5SDimitry Andric   case AMDGPU::S_TTRACEDATA:
1460b57cec5SDimitry Andric     return true;
1470b57cec5SDimitry Andric   // These DS opcodes don't support GDS.
1480b57cec5SDimitry Andric   case AMDGPU::DS_NOP:
1490b57cec5SDimitry Andric   case AMDGPU::DS_PERMUTE_B32:
1500b57cec5SDimitry Andric   case AMDGPU::DS_BPERMUTE_B32:
1510b57cec5SDimitry Andric     return false;
1520b57cec5SDimitry Andric   default:
1530b57cec5SDimitry Andric     if (TII.isDS(MI.getOpcode())) {
1540b57cec5SDimitry Andric       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1550b57cec5SDimitry Andric                                            AMDGPU::OpName::gds);
1560b57cec5SDimitry Andric       if (MI.getOperand(GDS).getImm())
1570b57cec5SDimitry Andric         return true;
1580b57cec5SDimitry Andric     }
1590b57cec5SDimitry Andric     return false;
1600b57cec5SDimitry Andric   }
1610b57cec5SDimitry Andric }
1620b57cec5SDimitry Andric 
1630b57cec5SDimitry Andric static bool isPermlane(const MachineInstr &MI) {
1640b57cec5SDimitry Andric   unsigned Opcode = MI.getOpcode();
165e8d8bef9SDimitry Andric   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166e8d8bef9SDimitry Andric          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
1670b57cec5SDimitry Andric }
1680b57cec5SDimitry Andric 
169*81ad6265SDimitry Andric static bool isLdsDma(const MachineInstr &MI) {
170*81ad6265SDimitry Andric   return SIInstrInfo::isVALU(MI) &&
171*81ad6265SDimitry Andric          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
172*81ad6265SDimitry Andric }
173*81ad6265SDimitry Andric 
1740b57cec5SDimitry Andric static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
1750b57cec5SDimitry Andric   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
1760b57cec5SDimitry Andric                                                      AMDGPU::OpName::simm16);
1770b57cec5SDimitry Andric   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
1780b57cec5SDimitry Andric }
1790b57cec5SDimitry Andric 
1800b57cec5SDimitry Andric ScheduleHazardRecognizer::HazardType
1810b57cec5SDimitry Andric GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
1820b57cec5SDimitry Andric   MachineInstr *MI = SU->getInstr();
183e8d8bef9SDimitry Andric   // If we are not in "HazardRecognizerMode" and therefore not being run from
184e8d8bef9SDimitry Andric   // the scheduler, track possible stalls from hazards but don't insert noops.
185e8d8bef9SDimitry Andric   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
186e8d8bef9SDimitry Andric 
1870b57cec5SDimitry Andric   if (MI->isBundle())
1880b57cec5SDimitry Andric    return NoHazard;
1890b57cec5SDimitry Andric 
1900b57cec5SDimitry Andric   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
191e8d8bef9SDimitry Andric     return HazardType;
1920b57cec5SDimitry Andric 
1930b57cec5SDimitry Andric   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
194e8d8bef9SDimitry Andric     return HazardType;
1950b57cec5SDimitry Andric 
1960b57cec5SDimitry Andric   if (checkFPAtomicToDenormModeHazard(MI) > 0)
197e8d8bef9SDimitry Andric     return HazardType;
1980b57cec5SDimitry Andric 
1990b57cec5SDimitry Andric   if (ST.hasNoDataDepHazard())
2000b57cec5SDimitry Andric     return NoHazard;
2010b57cec5SDimitry Andric 
202fe6060f1SDimitry Andric   // FIXME: Should flat be considered vmem?
203fe6060f1SDimitry Andric   if ((SIInstrInfo::isVMEM(*MI) ||
204fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI))
205fe6060f1SDimitry Andric       && checkVMEMHazards(MI) > 0)
206fe6060f1SDimitry Andric     return HazardType;
207fe6060f1SDimitry Andric 
2080b57cec5SDimitry Andric   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
209e8d8bef9SDimitry Andric     return HazardType;
2100b57cec5SDimitry Andric 
2110b57cec5SDimitry Andric   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
212e8d8bef9SDimitry Andric     return HazardType;
2130b57cec5SDimitry Andric 
2140b57cec5SDimitry Andric   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
215e8d8bef9SDimitry Andric     return HazardType;
2160b57cec5SDimitry Andric 
2170b57cec5SDimitry Andric   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
218e8d8bef9SDimitry Andric     return HazardType;
2190b57cec5SDimitry Andric 
220fe6060f1SDimitry Andric   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
221fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
222fe6060f1SDimitry Andric        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
223fe6060f1SDimitry Andric     return HazardType;
224fe6060f1SDimitry Andric 
2250b57cec5SDimitry Andric   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
226e8d8bef9SDimitry Andric     return HazardType;
2270b57cec5SDimitry Andric 
2280b57cec5SDimitry Andric   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
229e8d8bef9SDimitry Andric     return HazardType;
2300b57cec5SDimitry Andric 
2310b57cec5SDimitry Andric   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
232e8d8bef9SDimitry Andric     return HazardType;
2330b57cec5SDimitry Andric 
234*81ad6265SDimitry Andric   if (((ST.hasReadM0MovRelInterpHazard() &&
235*81ad6265SDimitry Andric         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
236*81ad6265SDimitry Andric        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
237*81ad6265SDimitry Andric        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
238*81ad6265SDimitry Andric        (ST.hasReadM0LdsDirectHazard() &&
239*81ad6265SDimitry Andric         MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
2400b57cec5SDimitry Andric       checkReadM0Hazards(MI) > 0)
241e8d8bef9SDimitry Andric     return HazardType;
2420b57cec5SDimitry Andric 
2430b57cec5SDimitry Andric   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
244e8d8bef9SDimitry Andric     return HazardType;
2450b57cec5SDimitry Andric 
246e8d8bef9SDimitry Andric   if ((SIInstrInfo::isVMEM(*MI) ||
247e8d8bef9SDimitry Andric        SIInstrInfo::isFLAT(*MI) ||
248e8d8bef9SDimitry Andric        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
249e8d8bef9SDimitry Andric     return HazardType;
2500b57cec5SDimitry Andric 
2510b57cec5SDimitry Andric   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
252e8d8bef9SDimitry Andric     return HazardType;
2530b57cec5SDimitry Andric 
2540b57cec5SDimitry Andric   return NoHazard;
2550b57cec5SDimitry Andric }
2560b57cec5SDimitry Andric 
257e8d8bef9SDimitry Andric static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
258e8d8bef9SDimitry Andric                                 unsigned Quantity) {
259e8d8bef9SDimitry Andric   while (Quantity > 0) {
260e8d8bef9SDimitry Andric     unsigned Arg = std::min(Quantity, 8u);
261e8d8bef9SDimitry Andric     Quantity -= Arg;
2620b57cec5SDimitry Andric     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
263e8d8bef9SDimitry Andric         .addImm(Arg - 1);
264e8d8bef9SDimitry Andric   }
2650b57cec5SDimitry Andric }
2660b57cec5SDimitry Andric 
267*81ad6265SDimitry Andric unsigned
268*81ad6265SDimitry Andric GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
269*81ad6265SDimitry Andric   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
270*81ad6265SDimitry Andric   assert(TSchedModel.getWriteProcResBegin(SC) !=
271*81ad6265SDimitry Andric          TSchedModel.getWriteProcResEnd(SC));
272*81ad6265SDimitry Andric   return TSchedModel.getWriteProcResBegin(SC)->Cycles;
273*81ad6265SDimitry Andric }
274*81ad6265SDimitry Andric 
2750b57cec5SDimitry Andric void GCNHazardRecognizer::processBundle() {
2760b57cec5SDimitry Andric   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
2770b57cec5SDimitry Andric   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
2780b57cec5SDimitry Andric   // Check bundled MachineInstr's for hazards.
2790b57cec5SDimitry Andric   for (; MI != E && MI->isInsideBundle(); ++MI) {
2800b57cec5SDimitry Andric     CurrCycleInstr = &*MI;
2810b57cec5SDimitry Andric     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
2820b57cec5SDimitry Andric 
283e8d8bef9SDimitry Andric     if (IsHazardRecognizerMode) {
2840b57cec5SDimitry Andric       fixHazards(CurrCycleInstr);
2850b57cec5SDimitry Andric 
286e8d8bef9SDimitry Andric       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
287e8d8bef9SDimitry Andric     }
2880b57cec5SDimitry Andric 
2890b57cec5SDimitry Andric     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
2900b57cec5SDimitry Andric     // include the bundled MI directly after, only add a maximum of
2910b57cec5SDimitry Andric     // (MaxLookAhead - 1) noops to EmittedInstrs.
2920b57cec5SDimitry Andric     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
2930b57cec5SDimitry Andric       EmittedInstrs.push_front(nullptr);
2940b57cec5SDimitry Andric 
2950b57cec5SDimitry Andric     EmittedInstrs.push_front(CurrCycleInstr);
2960b57cec5SDimitry Andric     EmittedInstrs.resize(MaxLookAhead);
2970b57cec5SDimitry Andric   }
2980b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
2990b57cec5SDimitry Andric }
3000b57cec5SDimitry Andric 
3010b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
3020b57cec5SDimitry Andric   IsHazardRecognizerMode = true;
3030b57cec5SDimitry Andric   CurrCycleInstr = MI;
3040b57cec5SDimitry Andric   unsigned W = PreEmitNoopsCommon(MI);
3050b57cec5SDimitry Andric   fixHazards(MI);
3060b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
3070b57cec5SDimitry Andric   return W;
3080b57cec5SDimitry Andric }
3090b57cec5SDimitry Andric 
3100b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
3110b57cec5SDimitry Andric   if (MI->isBundle())
3120b57cec5SDimitry Andric     return 0;
3130b57cec5SDimitry Andric 
314e8d8bef9SDimitry Andric   int WaitStates = 0;
3150b57cec5SDimitry Andric 
3160b57cec5SDimitry Andric   if (SIInstrInfo::isSMRD(*MI))
3170b57cec5SDimitry Andric     return std::max(WaitStates, checkSMRDHazards(MI));
3180b57cec5SDimitry Andric 
3190b57cec5SDimitry Andric   if (ST.hasNSAtoVMEMBug())
3200b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
3210b57cec5SDimitry Andric 
3220b57cec5SDimitry Andric   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
3230b57cec5SDimitry Andric 
3240b57cec5SDimitry Andric   if (ST.hasNoDataDepHazard())
3250b57cec5SDimitry Andric     return WaitStates;
3260b57cec5SDimitry Andric 
327fe6060f1SDimitry Andric   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
328fe6060f1SDimitry Andric     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
329fe6060f1SDimitry Andric 
3300b57cec5SDimitry Andric   if (SIInstrInfo::isVALU(*MI))
3310b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
3320b57cec5SDimitry Andric 
3330b57cec5SDimitry Andric   if (SIInstrInfo::isDPP(*MI))
3340b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
3350b57cec5SDimitry Andric 
3360b57cec5SDimitry Andric   if (isDivFMas(MI->getOpcode()))
3370b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
3380b57cec5SDimitry Andric 
3390b57cec5SDimitry Andric   if (isRWLane(MI->getOpcode()))
3400b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
3410b57cec5SDimitry Andric 
342fe6060f1SDimitry Andric   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
343fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
344fe6060f1SDimitry Andric        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
345fe6060f1SDimitry Andric     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
346fe6060f1SDimitry Andric 
3470b57cec5SDimitry Andric   if (MI->isInlineAsm())
3480b57cec5SDimitry Andric     return std::max(WaitStates, checkInlineAsmHazards(MI));
3490b57cec5SDimitry Andric 
3500b57cec5SDimitry Andric   if (isSGetReg(MI->getOpcode()))
3510b57cec5SDimitry Andric     return std::max(WaitStates, checkGetRegHazards(MI));
3520b57cec5SDimitry Andric 
3530b57cec5SDimitry Andric   if (isSSetReg(MI->getOpcode()))
3540b57cec5SDimitry Andric     return std::max(WaitStates, checkSetRegHazards(MI));
3550b57cec5SDimitry Andric 
3560b57cec5SDimitry Andric   if (isRFE(MI->getOpcode()))
3570b57cec5SDimitry Andric     return std::max(WaitStates, checkRFEHazards(MI));
3580b57cec5SDimitry Andric 
359*81ad6265SDimitry Andric   if ((ST.hasReadM0MovRelInterpHazard() &&
360*81ad6265SDimitry Andric        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) ||
361*81ad6265SDimitry Andric       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
362*81ad6265SDimitry Andric       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
363*81ad6265SDimitry Andric       (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
3640b57cec5SDimitry Andric     return std::max(WaitStates, checkReadM0Hazards(MI));
3650b57cec5SDimitry Andric 
3660b57cec5SDimitry Andric   if (SIInstrInfo::isMAI(*MI))
3670b57cec5SDimitry Andric     return std::max(WaitStates, checkMAIHazards(MI));
3680b57cec5SDimitry Andric 
369e8d8bef9SDimitry Andric   if (SIInstrInfo::isVMEM(*MI) ||
370e8d8bef9SDimitry Andric       SIInstrInfo::isFLAT(*MI) ||
371e8d8bef9SDimitry Andric       SIInstrInfo::isDS(*MI))
3720b57cec5SDimitry Andric     return std::max(WaitStates, checkMAILdStHazards(MI));
3730b57cec5SDimitry Andric 
3740b57cec5SDimitry Andric   return WaitStates;
3750b57cec5SDimitry Andric }
3760b57cec5SDimitry Andric 
3770b57cec5SDimitry Andric void GCNHazardRecognizer::EmitNoop() {
3780b57cec5SDimitry Andric   EmittedInstrs.push_front(nullptr);
3790b57cec5SDimitry Andric }
3800b57cec5SDimitry Andric 
3810b57cec5SDimitry Andric void GCNHazardRecognizer::AdvanceCycle() {
3820b57cec5SDimitry Andric   // When the scheduler detects a stall, it will call AdvanceCycle() without
3830b57cec5SDimitry Andric   // emitting any instructions.
384e8d8bef9SDimitry Andric   if (!CurrCycleInstr) {
385e8d8bef9SDimitry Andric     EmittedInstrs.push_front(nullptr);
3860b57cec5SDimitry Andric     return;
387e8d8bef9SDimitry Andric   }
3880b57cec5SDimitry Andric 
3890b57cec5SDimitry Andric   if (CurrCycleInstr->isBundle()) {
3900b57cec5SDimitry Andric     processBundle();
3910b57cec5SDimitry Andric     return;
3920b57cec5SDimitry Andric   }
3930b57cec5SDimitry Andric 
3940b57cec5SDimitry Andric   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
395349cc55cSDimitry Andric   if (!NumWaitStates) {
396349cc55cSDimitry Andric     CurrCycleInstr = nullptr;
397349cc55cSDimitry Andric     return;
398349cc55cSDimitry Andric   }
3990b57cec5SDimitry Andric 
4000b57cec5SDimitry Andric   // Keep track of emitted instructions
4010b57cec5SDimitry Andric   EmittedInstrs.push_front(CurrCycleInstr);
4020b57cec5SDimitry Andric 
4030b57cec5SDimitry Andric   // Add a nullptr for each additional wait state after the first.  Make sure
4040b57cec5SDimitry Andric   // not to add more than getMaxLookAhead() items to the list, since we
4050b57cec5SDimitry Andric   // truncate the list to that size right after this loop.
4060b57cec5SDimitry Andric   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
4070b57cec5SDimitry Andric        i < e; ++i) {
4080b57cec5SDimitry Andric     EmittedInstrs.push_front(nullptr);
4090b57cec5SDimitry Andric   }
4100b57cec5SDimitry Andric 
4110b57cec5SDimitry Andric   // getMaxLookahead() is the largest number of wait states we will ever need
4120b57cec5SDimitry Andric   // to insert, so there is no point in keeping track of more than that many
4130b57cec5SDimitry Andric   // wait states.
4140b57cec5SDimitry Andric   EmittedInstrs.resize(getMaxLookAhead());
4150b57cec5SDimitry Andric 
4160b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
4170b57cec5SDimitry Andric }
4180b57cec5SDimitry Andric 
4190b57cec5SDimitry Andric void GCNHazardRecognizer::RecedeCycle() {
4200b57cec5SDimitry Andric   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
4210b57cec5SDimitry Andric }
4220b57cec5SDimitry Andric 
4230b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4240b57cec5SDimitry Andric // Helper Functions
4250b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4260b57cec5SDimitry Andric 
427*81ad6265SDimitry Andric typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
428*81ad6265SDimitry Andric 
429fe6060f1SDimitry Andric typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
430*81ad6265SDimitry Andric typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
431*81ad6265SDimitry Andric 
432*81ad6265SDimitry Andric // Search for a hazard in a block and its predecessors.
433*81ad6265SDimitry Andric template <typename StateT>
434*81ad6265SDimitry Andric static bool
435*81ad6265SDimitry Andric hasHazard(StateT State,
436*81ad6265SDimitry Andric           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
437*81ad6265SDimitry Andric           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
438*81ad6265SDimitry Andric           const MachineBasicBlock *MBB,
439*81ad6265SDimitry Andric           MachineBasicBlock::const_reverse_instr_iterator I,
440*81ad6265SDimitry Andric           DenseSet<const MachineBasicBlock *> &Visited) {
441*81ad6265SDimitry Andric   for (auto E = MBB->instr_rend(); I != E; ++I) {
442*81ad6265SDimitry Andric     // No need to look at parent BUNDLE instructions.
443*81ad6265SDimitry Andric     if (I->isBundle())
444*81ad6265SDimitry Andric       continue;
445*81ad6265SDimitry Andric 
446*81ad6265SDimitry Andric     switch (IsHazard(State, *I)) {
447*81ad6265SDimitry Andric     case HazardFound:
448*81ad6265SDimitry Andric       return true;
449*81ad6265SDimitry Andric     case HazardExpired:
450*81ad6265SDimitry Andric       return false;
451*81ad6265SDimitry Andric     default:
452*81ad6265SDimitry Andric       // Continue search
453*81ad6265SDimitry Andric       break;
454*81ad6265SDimitry Andric     }
455*81ad6265SDimitry Andric 
456*81ad6265SDimitry Andric     if (I->isInlineAsm() || I->isMetaInstruction())
457*81ad6265SDimitry Andric       continue;
458*81ad6265SDimitry Andric 
459*81ad6265SDimitry Andric     UpdateState(State, *I);
460*81ad6265SDimitry Andric   }
461*81ad6265SDimitry Andric 
462*81ad6265SDimitry Andric   for (MachineBasicBlock *Pred : MBB->predecessors()) {
463*81ad6265SDimitry Andric     if (!Visited.insert(Pred).second)
464*81ad6265SDimitry Andric       continue;
465*81ad6265SDimitry Andric 
466*81ad6265SDimitry Andric     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
467*81ad6265SDimitry Andric                   Visited))
468*81ad6265SDimitry Andric       return true;
469*81ad6265SDimitry Andric   }
470*81ad6265SDimitry Andric 
471*81ad6265SDimitry Andric   return false;
472*81ad6265SDimitry Andric }
4730b57cec5SDimitry Andric 
4740b57cec5SDimitry Andric // Returns a minimum wait states since \p I walking all predecessors.
4750b57cec5SDimitry Andric // Only scans until \p IsExpired does not return true.
4760b57cec5SDimitry Andric // Can only be run in a hazard recognizer mode.
477*81ad6265SDimitry Andric static int getWaitStatesSince(
478*81ad6265SDimitry Andric     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
479*81ad6265SDimitry Andric     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
480*81ad6265SDimitry Andric     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
481*81ad6265SDimitry Andric     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
4820b57cec5SDimitry Andric   for (auto E = MBB->instr_rend(); I != E; ++I) {
4830b57cec5SDimitry Andric     // Don't add WaitStates for parent BUNDLE instructions.
4840b57cec5SDimitry Andric     if (I->isBundle())
4850b57cec5SDimitry Andric       continue;
4860b57cec5SDimitry Andric 
487fe6060f1SDimitry Andric     if (IsHazard(*I))
4880b57cec5SDimitry Andric       return WaitStates;
4890b57cec5SDimitry Andric 
490349cc55cSDimitry Andric     if (I->isInlineAsm())
4910b57cec5SDimitry Andric       continue;
4920b57cec5SDimitry Andric 
493*81ad6265SDimitry Andric     WaitStates += GetNumWaitStates(*I);
4940b57cec5SDimitry Andric 
495fe6060f1SDimitry Andric     if (IsExpired(*I, WaitStates))
4960b57cec5SDimitry Andric       return std::numeric_limits<int>::max();
4970b57cec5SDimitry Andric   }
4980b57cec5SDimitry Andric 
499fe6060f1SDimitry Andric   int MinWaitStates = std::numeric_limits<int>::max();
5000b57cec5SDimitry Andric   for (MachineBasicBlock *Pred : MBB->predecessors()) {
5010b57cec5SDimitry Andric     if (!Visited.insert(Pred).second)
5020b57cec5SDimitry Andric       continue;
5030b57cec5SDimitry Andric 
504*81ad6265SDimitry Andric     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
505*81ad6265SDimitry Andric                                IsExpired, Visited, GetNumWaitStates);
5060b57cec5SDimitry Andric 
507fe6060f1SDimitry Andric     MinWaitStates = std::min(MinWaitStates, W);
5080b57cec5SDimitry Andric   }
5090b57cec5SDimitry Andric 
5100b57cec5SDimitry Andric   return MinWaitStates;
5110b57cec5SDimitry Andric }
5120b57cec5SDimitry Andric 
5130b57cec5SDimitry Andric static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
514fe6060f1SDimitry Andric                               const MachineInstr *MI, IsExpiredFn IsExpired) {
5150b57cec5SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
5160b57cec5SDimitry Andric   return getWaitStatesSince(IsHazard, MI->getParent(),
5170b57cec5SDimitry Andric                             std::next(MI->getReverseIterator()),
5180b57cec5SDimitry Andric                             0, IsExpired, Visited);
5190b57cec5SDimitry Andric }
5200b57cec5SDimitry Andric 
5210b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
5220b57cec5SDimitry Andric   if (IsHazardRecognizerMode) {
523fe6060f1SDimitry Andric     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
5240b57cec5SDimitry Andric       return WaitStates >= Limit;
5250b57cec5SDimitry Andric     };
5260b57cec5SDimitry Andric     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
5270b57cec5SDimitry Andric   }
5280b57cec5SDimitry Andric 
5290b57cec5SDimitry Andric   int WaitStates = 0;
5300b57cec5SDimitry Andric   for (MachineInstr *MI : EmittedInstrs) {
5310b57cec5SDimitry Andric     if (MI) {
532fe6060f1SDimitry Andric       if (IsHazard(*MI))
5330b57cec5SDimitry Andric         return WaitStates;
5340b57cec5SDimitry Andric 
5350b57cec5SDimitry Andric       if (MI->isInlineAsm())
5360b57cec5SDimitry Andric         continue;
5370b57cec5SDimitry Andric     }
5380b57cec5SDimitry Andric     ++WaitStates;
5390b57cec5SDimitry Andric 
5400b57cec5SDimitry Andric     if (WaitStates >= Limit)
5410b57cec5SDimitry Andric       break;
5420b57cec5SDimitry Andric   }
5430b57cec5SDimitry Andric   return std::numeric_limits<int>::max();
5440b57cec5SDimitry Andric }
5450b57cec5SDimitry Andric 
5460b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
5470b57cec5SDimitry Andric                                                IsHazardFn IsHazardDef,
5480b57cec5SDimitry Andric                                                int Limit) {
5490b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
5500b57cec5SDimitry Andric 
551fe6060f1SDimitry Andric   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
552fe6060f1SDimitry Andric     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
5530b57cec5SDimitry Andric   };
5540b57cec5SDimitry Andric 
5550b57cec5SDimitry Andric   return getWaitStatesSince(IsHazardFn, Limit);
5560b57cec5SDimitry Andric }
5570b57cec5SDimitry Andric 
5580b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
5590b57cec5SDimitry Andric                                                   int Limit) {
560fe6060f1SDimitry Andric   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
561fe6060f1SDimitry Andric     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
5620b57cec5SDimitry Andric   };
5630b57cec5SDimitry Andric 
5640b57cec5SDimitry Andric   return getWaitStatesSince(IsHazardFn, Limit);
5650b57cec5SDimitry Andric }
5660b57cec5SDimitry Andric 
5670b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5680b57cec5SDimitry Andric // No-op Hazard Detection
5690b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5700b57cec5SDimitry Andric 
571e8d8bef9SDimitry Andric static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
572e8d8bef9SDimitry Andric                         MCRegister Reg) {
5730b57cec5SDimitry Andric   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
5740b57cec5SDimitry Andric     BV.set(*RUI);
5750b57cec5SDimitry Andric }
5760b57cec5SDimitry Andric 
5770b57cec5SDimitry Andric static void addRegsToSet(const SIRegisterInfo &TRI,
5780b57cec5SDimitry Andric                          iterator_range<MachineInstr::const_mop_iterator> Ops,
5790b57cec5SDimitry Andric                          BitVector &Set) {
5800b57cec5SDimitry Andric   for (const MachineOperand &Op : Ops) {
5810b57cec5SDimitry Andric     if (Op.isReg())
582e8d8bef9SDimitry Andric       addRegUnits(TRI, Set, Op.getReg().asMCReg());
5830b57cec5SDimitry Andric   }
5840b57cec5SDimitry Andric }
5850b57cec5SDimitry Andric 
5860b57cec5SDimitry Andric void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
5870b57cec5SDimitry Andric   // XXX: Do we need to worry about implicit operands
5880b57cec5SDimitry Andric   addRegsToSet(TRI, MI.defs(), ClauseDefs);
5890b57cec5SDimitry Andric   addRegsToSet(TRI, MI.uses(), ClauseUses);
5900b57cec5SDimitry Andric }
5910b57cec5SDimitry Andric 
5925ffd83dbSDimitry Andric static bool breaksSMEMSoftClause(MachineInstr *MI) {
5935ffd83dbSDimitry Andric   return !SIInstrInfo::isSMRD(*MI);
5945ffd83dbSDimitry Andric }
5955ffd83dbSDimitry Andric 
5965ffd83dbSDimitry Andric static bool breaksVMEMSoftClause(MachineInstr *MI) {
5975ffd83dbSDimitry Andric   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
5985ffd83dbSDimitry Andric }
5995ffd83dbSDimitry Andric 
6000b57cec5SDimitry Andric int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
6010b57cec5SDimitry Andric   // SMEM soft clause are only present on VI+, and only matter if xnack is
6020b57cec5SDimitry Andric   // enabled.
6030b57cec5SDimitry Andric   if (!ST.isXNACKEnabled())
6040b57cec5SDimitry Andric     return 0;
6050b57cec5SDimitry Andric 
6060b57cec5SDimitry Andric   bool IsSMRD = TII.isSMRD(*MEM);
6070b57cec5SDimitry Andric 
6080b57cec5SDimitry Andric   resetClause();
6090b57cec5SDimitry Andric 
6100b57cec5SDimitry Andric   // A soft-clause is any group of consecutive SMEM instructions.  The
6110b57cec5SDimitry Andric   // instructions in this group may return out of order and/or may be
6120b57cec5SDimitry Andric   // replayed (i.e. the same instruction issued more than once).
6130b57cec5SDimitry Andric   //
6140b57cec5SDimitry Andric   // In order to handle these situations correctly we need to make sure that
6150b57cec5SDimitry Andric   // when a clause has more than one instruction, no instruction in the clause
6160b57cec5SDimitry Andric   // writes to a register that is read by another instruction in the clause
617*81ad6265SDimitry Andric   // (including itself). If we encounter this situation, we need to break the
6180b57cec5SDimitry Andric   // clause by inserting a non SMEM instruction.
6190b57cec5SDimitry Andric 
6200b57cec5SDimitry Andric   for (MachineInstr *MI : EmittedInstrs) {
6210b57cec5SDimitry Andric     // When we hit a non-SMEM instruction then we have passed the start of the
6220b57cec5SDimitry Andric     // clause and we can stop.
6230b57cec5SDimitry Andric     if (!MI)
6240b57cec5SDimitry Andric       break;
6250b57cec5SDimitry Andric 
6265ffd83dbSDimitry Andric     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
6270b57cec5SDimitry Andric       break;
6280b57cec5SDimitry Andric 
6290b57cec5SDimitry Andric     addClauseInst(*MI);
6300b57cec5SDimitry Andric   }
6310b57cec5SDimitry Andric 
6320b57cec5SDimitry Andric   if (ClauseDefs.none())
6330b57cec5SDimitry Andric     return 0;
6340b57cec5SDimitry Andric 
6350b57cec5SDimitry Andric   // We need to make sure not to put loads and stores in the same clause if they
6360b57cec5SDimitry Andric   // use the same address. For now, just start a new clause whenever we see a
6370b57cec5SDimitry Andric   // store.
6380b57cec5SDimitry Andric   if (MEM->mayStore())
6390b57cec5SDimitry Andric     return 1;
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric   addClauseInst(*MEM);
6420b57cec5SDimitry Andric 
6430b57cec5SDimitry Andric   // If the set of defs and uses intersect then we cannot add this instruction
6440b57cec5SDimitry Andric   // to the clause, so we have a hazard.
6450b57cec5SDimitry Andric   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
6460b57cec5SDimitry Andric }
6470b57cec5SDimitry Andric 
6480b57cec5SDimitry Andric int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
6490b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
6500b57cec5SDimitry Andric 
6510b57cec5SDimitry Andric   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
6520b57cec5SDimitry Andric 
6530b57cec5SDimitry Andric   // This SMRD hazard only affects SI.
6540b57cec5SDimitry Andric   if (!ST.hasSMRDReadVALUDefHazard())
6550b57cec5SDimitry Andric     return WaitStatesNeeded;
6560b57cec5SDimitry Andric 
6570b57cec5SDimitry Andric   // A read of an SGPR by SMRD instruction requires 4 wait states when the
6580b57cec5SDimitry Andric   // SGPR was written by a VALU instruction.
6590b57cec5SDimitry Andric   int SmrdSgprWaitStates = 4;
660fe6060f1SDimitry Andric   auto IsHazardDefFn = [this](const MachineInstr &MI) {
661fe6060f1SDimitry Andric     return TII.isVALU(MI);
662fe6060f1SDimitry Andric   };
663fe6060f1SDimitry Andric   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
664fe6060f1SDimitry Andric     return TII.isSALU(MI);
665fe6060f1SDimitry Andric   };
6660b57cec5SDimitry Andric 
6670b57cec5SDimitry Andric   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
6680b57cec5SDimitry Andric 
6690b57cec5SDimitry Andric   for (const MachineOperand &Use : SMRD->uses()) {
6700b57cec5SDimitry Andric     if (!Use.isReg())
6710b57cec5SDimitry Andric       continue;
6720b57cec5SDimitry Andric     int WaitStatesNeededForUse =
6730b57cec5SDimitry Andric         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
6740b57cec5SDimitry Andric                                                    SmrdSgprWaitStates);
6750b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
6760b57cec5SDimitry Andric 
6770b57cec5SDimitry Andric     // This fixes what appears to be undocumented hardware behavior in SI where
6780b57cec5SDimitry Andric     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
6790b57cec5SDimitry Andric     // needs some number of nops in between. We don't know how many we need, but
6800b57cec5SDimitry Andric     // let's use 4. This wasn't discovered before probably because the only
6810b57cec5SDimitry Andric     // case when this happens is when we expand a 64-bit pointer into a full
6820b57cec5SDimitry Andric     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
6830b57cec5SDimitry Andric     // probably never encountered in the closed-source land.
6840b57cec5SDimitry Andric     if (IsBufferSMRD) {
6850b57cec5SDimitry Andric       int WaitStatesNeededForUse =
6860b57cec5SDimitry Andric         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
6870b57cec5SDimitry Andric                                                    IsBufferHazardDefFn,
6880b57cec5SDimitry Andric                                                    SmrdSgprWaitStates);
6890b57cec5SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
6900b57cec5SDimitry Andric     }
6910b57cec5SDimitry Andric   }
6920b57cec5SDimitry Andric 
6930b57cec5SDimitry Andric   return WaitStatesNeeded;
6940b57cec5SDimitry Andric }
6950b57cec5SDimitry Andric 
6960b57cec5SDimitry Andric int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
6970b57cec5SDimitry Andric   if (!ST.hasVMEMReadSGPRVALUDefHazard())
6980b57cec5SDimitry Andric     return 0;
6990b57cec5SDimitry Andric 
7000b57cec5SDimitry Andric   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
7010b57cec5SDimitry Andric 
7020b57cec5SDimitry Andric   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
7030b57cec5SDimitry Andric   // SGPR was written by a VALU Instruction.
7040b57cec5SDimitry Andric   const int VmemSgprWaitStates = 5;
705fe6060f1SDimitry Andric   auto IsHazardDefFn = [this](const MachineInstr &MI) {
706fe6060f1SDimitry Andric     return TII.isVALU(MI);
707fe6060f1SDimitry Andric   };
7080b57cec5SDimitry Andric   for (const MachineOperand &Use : VMEM->uses()) {
709fe6060f1SDimitry Andric     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
7100b57cec5SDimitry Andric       continue;
7110b57cec5SDimitry Andric 
7120b57cec5SDimitry Andric     int WaitStatesNeededForUse =
7130b57cec5SDimitry Andric         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
7140b57cec5SDimitry Andric                                                    VmemSgprWaitStates);
7150b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7160b57cec5SDimitry Andric   }
7170b57cec5SDimitry Andric   return WaitStatesNeeded;
7180b57cec5SDimitry Andric }
7190b57cec5SDimitry Andric 
7200b57cec5SDimitry Andric int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
7210b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
7220b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7230b57cec5SDimitry Andric 
7240b57cec5SDimitry Andric   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
7250b57cec5SDimitry Andric   int DppVgprWaitStates = 2;
7260b57cec5SDimitry Andric   int DppExecWaitStates = 5;
7270b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
728fe6060f1SDimitry Andric   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
729fe6060f1SDimitry Andric     return TII->isVALU(MI);
730fe6060f1SDimitry Andric   };
7310b57cec5SDimitry Andric 
7320b57cec5SDimitry Andric   for (const MachineOperand &Use : DPP->uses()) {
7330b57cec5SDimitry Andric     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
7340b57cec5SDimitry Andric       continue;
7350b57cec5SDimitry Andric     int WaitStatesNeededForUse =
736fe6060f1SDimitry Andric         DppVgprWaitStates - getWaitStatesSinceDef(
737fe6060f1SDimitry Andric                                 Use.getReg(),
738fe6060f1SDimitry Andric                                 [](const MachineInstr &) { return true; },
7390b57cec5SDimitry Andric                                 DppVgprWaitStates);
7400b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7410b57cec5SDimitry Andric   }
7420b57cec5SDimitry Andric 
7430b57cec5SDimitry Andric   WaitStatesNeeded = std::max(
7440b57cec5SDimitry Andric       WaitStatesNeeded,
7450b57cec5SDimitry Andric       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
7460b57cec5SDimitry Andric                                                 DppExecWaitStates));
7470b57cec5SDimitry Andric 
7480b57cec5SDimitry Andric   return WaitStatesNeeded;
7490b57cec5SDimitry Andric }
7500b57cec5SDimitry Andric 
7510b57cec5SDimitry Andric int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
7520b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7530b57cec5SDimitry Andric 
7540b57cec5SDimitry Andric   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
7550b57cec5SDimitry Andric   // instruction.
7560b57cec5SDimitry Andric   const int DivFMasWaitStates = 4;
757fe6060f1SDimitry Andric   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
758fe6060f1SDimitry Andric     return TII->isVALU(MI);
759fe6060f1SDimitry Andric   };
7600b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
7610b57cec5SDimitry Andric                                                DivFMasWaitStates);
7620b57cec5SDimitry Andric 
7630b57cec5SDimitry Andric   return DivFMasWaitStates - WaitStatesNeeded;
7640b57cec5SDimitry Andric }
7650b57cec5SDimitry Andric 
7660b57cec5SDimitry Andric int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
7670b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7680b57cec5SDimitry Andric   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
7690b57cec5SDimitry Andric 
7700b57cec5SDimitry Andric   const int GetRegWaitStates = 2;
771fe6060f1SDimitry Andric   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
772fe6060f1SDimitry Andric     return GetRegHWReg == getHWReg(TII, MI);
7730b57cec5SDimitry Andric   };
7740b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
7750b57cec5SDimitry Andric 
7760b57cec5SDimitry Andric   return GetRegWaitStates - WaitStatesNeeded;
7770b57cec5SDimitry Andric }
7780b57cec5SDimitry Andric 
7790b57cec5SDimitry Andric int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
7800b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7810b57cec5SDimitry Andric   unsigned HWReg = getHWReg(TII, *SetRegInstr);
7820b57cec5SDimitry Andric 
7830b57cec5SDimitry Andric   const int SetRegWaitStates = ST.getSetRegWaitStates();
784fe6060f1SDimitry Andric   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
785fe6060f1SDimitry Andric     return HWReg == getHWReg(TII, MI);
7860b57cec5SDimitry Andric   };
7870b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
7880b57cec5SDimitry Andric   return SetRegWaitStates - WaitStatesNeeded;
7890b57cec5SDimitry Andric }
7900b57cec5SDimitry Andric 
7910b57cec5SDimitry Andric int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
7920b57cec5SDimitry Andric   if (!MI.mayStore())
7930b57cec5SDimitry Andric     return -1;
7940b57cec5SDimitry Andric 
7950b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7960b57cec5SDimitry Andric   unsigned Opcode = MI.getOpcode();
7970b57cec5SDimitry Andric   const MCInstrDesc &Desc = MI.getDesc();
7980b57cec5SDimitry Andric 
7990b57cec5SDimitry Andric   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
8000b57cec5SDimitry Andric   int VDataRCID = -1;
8010b57cec5SDimitry Andric   if (VDataIdx != -1)
8020b57cec5SDimitry Andric     VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
8030b57cec5SDimitry Andric 
8040b57cec5SDimitry Andric   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
8050b57cec5SDimitry Andric     // There is no hazard if the instruction does not use vector regs
8060b57cec5SDimitry Andric     // (like wbinvl1)
8070b57cec5SDimitry Andric     if (VDataIdx == -1)
8080b57cec5SDimitry Andric       return -1;
8090b57cec5SDimitry Andric     // For MUBUF/MTBUF instructions this hazard only exists if the
8100b57cec5SDimitry Andric     // instruction is not using a register in the soffset field.
8110b57cec5SDimitry Andric     const MachineOperand *SOffset =
8120b57cec5SDimitry Andric         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
8130b57cec5SDimitry Andric     // If we have no soffset operand, then assume this field has been
8140b57cec5SDimitry Andric     // hardcoded to zero.
8150b57cec5SDimitry Andric     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
8160b57cec5SDimitry Andric         (!SOffset || !SOffset->isReg()))
8170b57cec5SDimitry Andric       return VDataIdx;
8180b57cec5SDimitry Andric   }
8190b57cec5SDimitry Andric 
8200b57cec5SDimitry Andric   // MIMG instructions create a hazard if they don't use a 256-bit T# and
8210b57cec5SDimitry Andric   // the store size is greater than 8 bytes and they have more than two bits
8220b57cec5SDimitry Andric   // of their dmask set.
8230b57cec5SDimitry Andric   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
8240b57cec5SDimitry Andric   if (TII->isMIMG(MI)) {
8250b57cec5SDimitry Andric     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
8260b57cec5SDimitry Andric     assert(SRsrcIdx != -1 &&
8270b57cec5SDimitry Andric            AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
8280b57cec5SDimitry Andric     (void)SRsrcIdx;
8290b57cec5SDimitry Andric   }
8300b57cec5SDimitry Andric 
8310b57cec5SDimitry Andric   if (TII->isFLAT(MI)) {
8320b57cec5SDimitry Andric     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
8330b57cec5SDimitry Andric     if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
8340b57cec5SDimitry Andric       return DataIdx;
8350b57cec5SDimitry Andric   }
8360b57cec5SDimitry Andric 
8370b57cec5SDimitry Andric   return -1;
8380b57cec5SDimitry Andric }
8390b57cec5SDimitry Andric 
840e8d8bef9SDimitry Andric int
841e8d8bef9SDimitry Andric GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
8420b57cec5SDimitry Andric                                             const MachineRegisterInfo &MRI) {
8430b57cec5SDimitry Andric   // Helper to check for the hazard where VMEM instructions that store more than
8440b57cec5SDimitry Andric   // 8 bytes can have there store data over written by the next instruction.
8450b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
8460b57cec5SDimitry Andric 
847*81ad6265SDimitry Andric   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
8480b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
8490b57cec5SDimitry Andric 
850fe6060f1SDimitry Andric   if (!TRI->isVectorRegister(MRI, Def.getReg()))
8510b57cec5SDimitry Andric     return WaitStatesNeeded;
8528bcb0991SDimitry Andric   Register Reg = Def.getReg();
853fe6060f1SDimitry Andric   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
854fe6060f1SDimitry Andric     int DataIdx = createsVALUHazard(MI);
8550b57cec5SDimitry Andric     return DataIdx >= 0 &&
856fe6060f1SDimitry Andric            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
8570b57cec5SDimitry Andric   };
8580b57cec5SDimitry Andric   int WaitStatesNeededForDef =
8590b57cec5SDimitry Andric     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
8600b57cec5SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
8610b57cec5SDimitry Andric 
8620b57cec5SDimitry Andric   return WaitStatesNeeded;
8630b57cec5SDimitry Andric }
8640b57cec5SDimitry Andric 
8650b57cec5SDimitry Andric int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
866*81ad6265SDimitry Andric   int WaitStatesNeeded = 0;
867*81ad6265SDimitry Andric 
868*81ad6265SDimitry Andric   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
869*81ad6265SDimitry Andric     const int TransDefWaitstates = 1;
870*81ad6265SDimitry Andric 
871*81ad6265SDimitry Andric     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
872*81ad6265SDimitry Andric       if (!SIInstrInfo::isTRANS(MI))
873*81ad6265SDimitry Andric         return false;
874*81ad6265SDimitry Andric       const SIRegisterInfo *TRI = ST.getRegisterInfo();
875*81ad6265SDimitry Andric       const SIInstrInfo *TII = ST.getInstrInfo();
876*81ad6265SDimitry Andric       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
877*81ad6265SDimitry Andric 
878*81ad6265SDimitry Andric       for (const MachineOperand &Use : VALU->explicit_uses()) {
879*81ad6265SDimitry Andric         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
880*81ad6265SDimitry Andric           return true;
881*81ad6265SDimitry Andric       }
882*81ad6265SDimitry Andric 
883*81ad6265SDimitry Andric       return false;
884*81ad6265SDimitry Andric     };
885*81ad6265SDimitry Andric 
886*81ad6265SDimitry Andric     int WaitStatesNeededForDef =
887*81ad6265SDimitry Andric         TransDefWaitstates -
888*81ad6265SDimitry Andric         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
889*81ad6265SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
890*81ad6265SDimitry Andric   }
891*81ad6265SDimitry Andric 
892*81ad6265SDimitry Andric   if (ST.hasDstSelForwardingHazard()) {
893*81ad6265SDimitry Andric     const int Shift16DefWaitstates = 1;
894*81ad6265SDimitry Andric 
895*81ad6265SDimitry Andric     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
896*81ad6265SDimitry Andric       if (!SIInstrInfo::isVALU(MI))
897*81ad6265SDimitry Andric         return false;
898*81ad6265SDimitry Andric       const SIInstrInfo *TII = ST.getInstrInfo();
899*81ad6265SDimitry Andric       if (SIInstrInfo::isSDWA(MI)) {
900*81ad6265SDimitry Andric         if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
901*81ad6265SDimitry Andric           if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
902*81ad6265SDimitry Andric             return false;
903*81ad6265SDimitry Andric       } else {
904*81ad6265SDimitry Andric         if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(),
905*81ad6265SDimitry Andric                                         AMDGPU::OpName::op_sel) == -1) ||
906*81ad6265SDimitry Andric             !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
907*81ad6265SDimitry Andric                   ->getImm() &
908*81ad6265SDimitry Andric               SISrcMods::DST_OP_SEL))
909*81ad6265SDimitry Andric           return false;
910*81ad6265SDimitry Andric       }
911*81ad6265SDimitry Andric       const SIRegisterInfo *TRI = ST.getRegisterInfo();
912*81ad6265SDimitry Andric       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
913*81ad6265SDimitry Andric         Register Def = Dst->getReg();
914*81ad6265SDimitry Andric 
915*81ad6265SDimitry Andric         for (const MachineOperand &Use : VALU->explicit_uses()) {
916*81ad6265SDimitry Andric           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
917*81ad6265SDimitry Andric             return true;
918*81ad6265SDimitry Andric         }
919*81ad6265SDimitry Andric       }
920*81ad6265SDimitry Andric 
921*81ad6265SDimitry Andric       return false;
922*81ad6265SDimitry Andric     };
923*81ad6265SDimitry Andric 
924*81ad6265SDimitry Andric     int WaitStatesNeededForDef =
925*81ad6265SDimitry Andric         Shift16DefWaitstates -
926*81ad6265SDimitry Andric         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
927*81ad6265SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
928*81ad6265SDimitry Andric   }
929*81ad6265SDimitry Andric 
930*81ad6265SDimitry Andric   if (ST.hasVDecCoExecHazard()) {
931*81ad6265SDimitry Andric     const int VALUWriteSGPRVALUReadWaitstates = 2;
932*81ad6265SDimitry Andric     const int VALUWriteEXECRWLane = 4;
933*81ad6265SDimitry Andric     const int VALUWriteVGPRReadlaneRead = 1;
934*81ad6265SDimitry Andric 
935*81ad6265SDimitry Andric     const SIRegisterInfo *TRI = ST.getRegisterInfo();
936*81ad6265SDimitry Andric     const MachineRegisterInfo &MRI = MF.getRegInfo();
937*81ad6265SDimitry Andric     Register UseReg;
938*81ad6265SDimitry Andric     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
939*81ad6265SDimitry Andric       if (!SIInstrInfo::isVALU(MI))
940*81ad6265SDimitry Andric         return false;
941*81ad6265SDimitry Andric       return MI.modifiesRegister(UseReg, TRI);
942*81ad6265SDimitry Andric     };
943*81ad6265SDimitry Andric 
944*81ad6265SDimitry Andric     for (const MachineOperand &Use : VALU->explicit_uses()) {
945*81ad6265SDimitry Andric       if (!Use.isReg())
946*81ad6265SDimitry Andric         continue;
947*81ad6265SDimitry Andric 
948*81ad6265SDimitry Andric       UseReg = Use.getReg();
949*81ad6265SDimitry Andric       if (TRI->isSGPRReg(MRI, UseReg)) {
950*81ad6265SDimitry Andric         int WaitStatesNeededForDef =
951*81ad6265SDimitry Andric             VALUWriteSGPRVALUReadWaitstates -
952*81ad6265SDimitry Andric             getWaitStatesSince(IsVALUDefSGPRFn,
953*81ad6265SDimitry Andric                                VALUWriteSGPRVALUReadWaitstates);
954*81ad6265SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
955*81ad6265SDimitry Andric       }
956*81ad6265SDimitry Andric     }
957*81ad6265SDimitry Andric 
958*81ad6265SDimitry Andric     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
959*81ad6265SDimitry Andric       UseReg = AMDGPU::VCC;
960*81ad6265SDimitry Andric       int WaitStatesNeededForDef =
961*81ad6265SDimitry Andric           VALUWriteSGPRVALUReadWaitstates -
962*81ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
963*81ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
964*81ad6265SDimitry Andric     }
965*81ad6265SDimitry Andric 
966*81ad6265SDimitry Andric     switch (VALU->getOpcode()) {
967*81ad6265SDimitry Andric     case AMDGPU::V_READLANE_B32:
968*81ad6265SDimitry Andric     case AMDGPU::V_READFIRSTLANE_B32: {
969*81ad6265SDimitry Andric       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
970*81ad6265SDimitry Andric       UseReg = Src->getReg();
971*81ad6265SDimitry Andric       int WaitStatesNeededForDef =
972*81ad6265SDimitry Andric           VALUWriteVGPRReadlaneRead -
973*81ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
974*81ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
975*81ad6265SDimitry Andric     }
976*81ad6265SDimitry Andric       LLVM_FALLTHROUGH;
977*81ad6265SDimitry Andric     case AMDGPU::V_WRITELANE_B32: {
978*81ad6265SDimitry Andric       UseReg = AMDGPU::EXEC;
979*81ad6265SDimitry Andric       int WaitStatesNeededForDef =
980*81ad6265SDimitry Andric           VALUWriteEXECRWLane -
981*81ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
982*81ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
983*81ad6265SDimitry Andric       break;
984*81ad6265SDimitry Andric     }
985*81ad6265SDimitry Andric     default:
986*81ad6265SDimitry Andric       break;
987*81ad6265SDimitry Andric     }
988*81ad6265SDimitry Andric   }
989*81ad6265SDimitry Andric 
9900b57cec5SDimitry Andric   // This checks for the hazard where VMEM instructions that store more than
9910b57cec5SDimitry Andric   // 8 bytes can have there store data over written by the next instruction.
9920b57cec5SDimitry Andric   if (!ST.has12DWordStoreHazard())
993*81ad6265SDimitry Andric     return WaitStatesNeeded;
9940b57cec5SDimitry Andric 
9950b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
9960b57cec5SDimitry Andric 
9970b57cec5SDimitry Andric   for (const MachineOperand &Def : VALU->defs()) {
9980b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
9990b57cec5SDimitry Andric   }
10000b57cec5SDimitry Andric 
10010b57cec5SDimitry Andric   return WaitStatesNeeded;
10020b57cec5SDimitry Andric }
10030b57cec5SDimitry Andric 
10040b57cec5SDimitry Andric int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10050b57cec5SDimitry Andric   // This checks for hazards associated with inline asm statements.
10060b57cec5SDimitry Andric   // Since inline asms can contain just about anything, we use this
10070b57cec5SDimitry Andric   // to call/leverage other check*Hazard routines. Note that
10080b57cec5SDimitry Andric   // this function doesn't attempt to address all possible inline asm
10090b57cec5SDimitry Andric   // hazards (good luck), but is a collection of what has been
10100b57cec5SDimitry Andric   // problematic thus far.
10110b57cec5SDimitry Andric 
10120b57cec5SDimitry Andric   // see checkVALUHazards()
10130b57cec5SDimitry Andric   if (!ST.has12DWordStoreHazard())
10140b57cec5SDimitry Andric     return 0;
10150b57cec5SDimitry Andric 
10160b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10170b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
10180b57cec5SDimitry Andric 
10190b57cec5SDimitry Andric   for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
10200b57cec5SDimitry Andric        I != E; ++I) {
10210b57cec5SDimitry Andric     const MachineOperand &Op = IA->getOperand(I);
10220b57cec5SDimitry Andric     if (Op.isReg() && Op.isDef()) {
10230b57cec5SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
10240b57cec5SDimitry Andric     }
10250b57cec5SDimitry Andric   }
10260b57cec5SDimitry Andric 
10270b57cec5SDimitry Andric   return WaitStatesNeeded;
10280b57cec5SDimitry Andric }
10290b57cec5SDimitry Andric 
10300b57cec5SDimitry Andric int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
10310b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
10320b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
10330b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10340b57cec5SDimitry Andric 
10350b57cec5SDimitry Andric   const MachineOperand *LaneSelectOp =
10360b57cec5SDimitry Andric       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
10370b57cec5SDimitry Andric 
10380b57cec5SDimitry Andric   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
10390b57cec5SDimitry Andric     return 0;
10400b57cec5SDimitry Andric 
10418bcb0991SDimitry Andric   Register LaneSelectReg = LaneSelectOp->getReg();
1042fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
10430b57cec5SDimitry Andric 
10440b57cec5SDimitry Andric   const int RWLaneWaitStates = 4;
10450b57cec5SDimitry Andric   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
10460b57cec5SDimitry Andric                                               RWLaneWaitStates);
10470b57cec5SDimitry Andric   return RWLaneWaitStates - WaitStatesSince;
10480b57cec5SDimitry Andric }
10490b57cec5SDimitry Andric 
10500b57cec5SDimitry Andric int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
10510b57cec5SDimitry Andric   if (!ST.hasRFEHazards())
10520b57cec5SDimitry Andric     return 0;
10530b57cec5SDimitry Andric 
10540b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
10550b57cec5SDimitry Andric 
10560b57cec5SDimitry Andric   const int RFEWaitStates = 1;
10570b57cec5SDimitry Andric 
1058fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) {
1059fe6060f1SDimitry Andric     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
10600b57cec5SDimitry Andric   };
10610b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
10620b57cec5SDimitry Andric   return RFEWaitStates - WaitStatesNeeded;
10630b57cec5SDimitry Andric }
10640b57cec5SDimitry Andric 
10650b57cec5SDimitry Andric int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
10660b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1067*81ad6265SDimitry Andric   const int ReadM0WaitStates = 1;
1068fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1069*81ad6265SDimitry Andric   return ReadM0WaitStates -
1070*81ad6265SDimitry Andric          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
10710b57cec5SDimitry Andric }
10720b57cec5SDimitry Andric 
10730b57cec5SDimitry Andric void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
10740b57cec5SDimitry Andric   fixVMEMtoScalarWriteHazards(MI);
10750b57cec5SDimitry Andric   fixVcmpxPermlaneHazards(MI);
10760b57cec5SDimitry Andric   fixSMEMtoVectorWriteHazards(MI);
10770b57cec5SDimitry Andric   fixVcmpxExecWARHazard(MI);
10780b57cec5SDimitry Andric   fixLdsBranchVmemWARHazard(MI);
1079*81ad6265SDimitry Andric   if (ST.hasLdsDirect()) {
1080*81ad6265SDimitry Andric     fixLdsDirectVALUHazard(MI);
1081*81ad6265SDimitry Andric     fixLdsDirectVMEMHazard(MI);
1082*81ad6265SDimitry Andric   }
1083*81ad6265SDimitry Andric   fixVALUPartialForwardingHazard(MI);
1084*81ad6265SDimitry Andric   fixVALUTransUseHazard(MI);
1085*81ad6265SDimitry Andric   fixWMMAHazards(MI);
10860b57cec5SDimitry Andric }
10870b57cec5SDimitry Andric 
10880b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
10890b57cec5SDimitry Andric   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
10900b57cec5SDimitry Andric     return false;
10910b57cec5SDimitry Andric 
10920b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1093*81ad6265SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1094*81ad6265SDimitry Andric   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1095*81ad6265SDimitry Andric     return (TII->isVOPC(MI) ||
1096*81ad6265SDimitry Andric             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1097*81ad6265SDimitry Andric            MI.modifiesRegister(AMDGPU::EXEC, TRI);
1098*81ad6265SDimitry Andric   };
10990b57cec5SDimitry Andric 
1100fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1101fe6060f1SDimitry Andric     unsigned Opc = MI.getOpcode();
1102fe6060f1SDimitry Andric     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1103fe6060f1SDimitry Andric            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
11040b57cec5SDimitry Andric   };
11050b57cec5SDimitry Andric 
11060b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11070b57cec5SDimitry Andric       std::numeric_limits<int>::max())
11080b57cec5SDimitry Andric     return false;
11090b57cec5SDimitry Andric 
11100b57cec5SDimitry Andric   // V_NOP will be discarded by SQ.
1111*81ad6265SDimitry Andric   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
11120b57cec5SDimitry Andric   // which is always a VGPR and available.
11130b57cec5SDimitry Andric   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
11148bcb0991SDimitry Andric   Register Reg = Src0->getReg();
11150b57cec5SDimitry Andric   bool IsUndef = Src0->isUndef();
11160b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
11170b57cec5SDimitry Andric           TII->get(AMDGPU::V_MOV_B32_e32))
11180b57cec5SDimitry Andric     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
11190b57cec5SDimitry Andric     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
11200b57cec5SDimitry Andric 
11210b57cec5SDimitry Andric   return true;
11220b57cec5SDimitry Andric }
11230b57cec5SDimitry Andric 
11240b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
11250b57cec5SDimitry Andric   if (!ST.hasVMEMtoScalarWriteHazard())
11260b57cec5SDimitry Andric     return false;
11270b57cec5SDimitry Andric 
11280b57cec5SDimitry Andric   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
11290b57cec5SDimitry Andric     return false;
11300b57cec5SDimitry Andric 
11310b57cec5SDimitry Andric   if (MI->getNumDefs() == 0)
11320b57cec5SDimitry Andric     return false;
11330b57cec5SDimitry Andric 
11340b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
11350b57cec5SDimitry Andric 
1136fe6060f1SDimitry Andric   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1137fe6060f1SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1138fe6060f1SDimitry Andric         !SIInstrInfo::isFLAT(I))
11390b57cec5SDimitry Andric       return false;
11400b57cec5SDimitry Andric 
11410b57cec5SDimitry Andric     for (const MachineOperand &Def : MI->defs()) {
1142fe6060f1SDimitry Andric       const MachineOperand *Op =
1143fe6060f1SDimitry Andric           I.findRegisterUseOperand(Def.getReg(), false, TRI);
11440b57cec5SDimitry Andric       if (!Op)
11450b57cec5SDimitry Andric         continue;
11460b57cec5SDimitry Andric       return true;
11470b57cec5SDimitry Andric     }
11480b57cec5SDimitry Andric     return false;
11490b57cec5SDimitry Andric   };
11500b57cec5SDimitry Andric 
1151fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1152fe6060f1SDimitry Andric     return SIInstrInfo::isVALU(MI) ||
1153fe6060f1SDimitry Andric            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1154fe6060f1SDimitry Andric             !MI.getOperand(0).getImm()) ||
1155fe6060f1SDimitry Andric            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1156fe6060f1SDimitry Andric             MI.getOperand(0).getImm() == 0xffe3);
11570b57cec5SDimitry Andric   };
11580b57cec5SDimitry Andric 
11590b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11600b57cec5SDimitry Andric       std::numeric_limits<int>::max())
11610b57cec5SDimitry Andric     return false;
11620b57cec5SDimitry Andric 
11630b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1164e8d8bef9SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1165e8d8bef9SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1166e8d8bef9SDimitry Andric       .addImm(0xffe3);
11670b57cec5SDimitry Andric   return true;
11680b57cec5SDimitry Andric }
11690b57cec5SDimitry Andric 
11700b57cec5SDimitry Andric bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
11710b57cec5SDimitry Andric   if (!ST.hasSMEMtoVectorWriteHazard())
11720b57cec5SDimitry Andric     return false;
11730b57cec5SDimitry Andric 
11740b57cec5SDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
11750b57cec5SDimitry Andric     return false;
11760b57cec5SDimitry Andric 
11770b57cec5SDimitry Andric   unsigned SDSTName;
11780b57cec5SDimitry Andric   switch (MI->getOpcode()) {
11790b57cec5SDimitry Andric   case AMDGPU::V_READLANE_B32:
11800b57cec5SDimitry Andric   case AMDGPU::V_READFIRSTLANE_B32:
11810b57cec5SDimitry Andric     SDSTName = AMDGPU::OpName::vdst;
11820b57cec5SDimitry Andric     break;
11830b57cec5SDimitry Andric   default:
11840b57cec5SDimitry Andric     SDSTName = AMDGPU::OpName::sdst;
11850b57cec5SDimitry Andric     break;
11860b57cec5SDimitry Andric   }
11870b57cec5SDimitry Andric 
11880b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
11890b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
11900b57cec5SDimitry Andric   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
11910b57cec5SDimitry Andric   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
11920b57cec5SDimitry Andric   if (!SDST) {
11930b57cec5SDimitry Andric     for (const auto &MO : MI->implicit_operands()) {
11940b57cec5SDimitry Andric       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
11950b57cec5SDimitry Andric         SDST = &MO;
11960b57cec5SDimitry Andric         break;
11970b57cec5SDimitry Andric       }
11980b57cec5SDimitry Andric     }
11990b57cec5SDimitry Andric   }
12000b57cec5SDimitry Andric 
12010b57cec5SDimitry Andric   if (!SDST)
12020b57cec5SDimitry Andric     return false;
12030b57cec5SDimitry Andric 
12048bcb0991SDimitry Andric   const Register SDSTReg = SDST->getReg();
1205fe6060f1SDimitry Andric   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1206fe6060f1SDimitry Andric     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
12070b57cec5SDimitry Andric   };
12080b57cec5SDimitry Andric 
1209fe6060f1SDimitry Andric   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1210fe6060f1SDimitry Andric     if (TII->isSALU(MI)) {
1211fe6060f1SDimitry Andric       switch (MI.getOpcode()) {
12120b57cec5SDimitry Andric       case AMDGPU::S_SETVSKIP:
12130b57cec5SDimitry Andric       case AMDGPU::S_VERSION:
12140b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_VSCNT:
12150b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_VMCNT:
12160b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_EXPCNT:
12170b57cec5SDimitry Andric         // These instructions cannot not mitigate the hazard.
12180b57cec5SDimitry Andric         return false;
12190b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_LGKMCNT:
12200b57cec5SDimitry Andric         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1221fe6060f1SDimitry Andric         return (MI.getOperand(1).getImm() == 0) &&
1222fe6060f1SDimitry Andric                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
12230b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT: {
1224fe6060f1SDimitry Andric         const int64_t Imm = MI.getOperand(0).getImm();
12250b57cec5SDimitry Andric         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
12260b57cec5SDimitry Andric         return (Decoded.LgkmCnt == 0);
12270b57cec5SDimitry Andric       }
12280b57cec5SDimitry Andric       default:
12290b57cec5SDimitry Andric         // SOPP instructions cannot mitigate the hazard.
1230fe6060f1SDimitry Andric         if (TII->isSOPP(MI))
12310b57cec5SDimitry Andric           return false;
12320b57cec5SDimitry Andric         // At this point the SALU can be assumed to mitigate the hazard
12330b57cec5SDimitry Andric         // because either:
12340b57cec5SDimitry Andric         // (a) it is independent of the at risk SMEM (breaking chain),
12350b57cec5SDimitry Andric         // or
12360b57cec5SDimitry Andric         // (b) it is dependent on the SMEM, in which case an appropriate
12370b57cec5SDimitry Andric         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
12380b57cec5SDimitry Andric         //     SMEM instruction.
12390b57cec5SDimitry Andric         return true;
12400b57cec5SDimitry Andric       }
12410b57cec5SDimitry Andric     }
12420b57cec5SDimitry Andric     return false;
12430b57cec5SDimitry Andric   };
12440b57cec5SDimitry Andric 
12450b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
12460b57cec5SDimitry Andric       std::numeric_limits<int>::max())
12470b57cec5SDimitry Andric     return false;
12480b57cec5SDimitry Andric 
12490b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
12500b57cec5SDimitry Andric           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
12510b57cec5SDimitry Andric       .addImm(0);
12520b57cec5SDimitry Andric   return true;
12530b57cec5SDimitry Andric }
12540b57cec5SDimitry Andric 
12550b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
12560b57cec5SDimitry Andric   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
12570b57cec5SDimitry Andric     return false;
12580b57cec5SDimitry Andric 
12590b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
12600b57cec5SDimitry Andric   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
12610b57cec5SDimitry Andric     return false;
12620b57cec5SDimitry Andric 
1263fe6060f1SDimitry Andric   auto IsHazardFn = [TRI](const MachineInstr &I) {
1264fe6060f1SDimitry Andric     if (SIInstrInfo::isVALU(I))
12650b57cec5SDimitry Andric       return false;
1266fe6060f1SDimitry Andric     return I.readsRegister(AMDGPU::EXEC, TRI);
12670b57cec5SDimitry Andric   };
12680b57cec5SDimitry Andric 
12690b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1270fe6060f1SDimitry Andric   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1271fe6060f1SDimitry Andric     if (SIInstrInfo::isVALU(MI)) {
1272fe6060f1SDimitry Andric       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
12730b57cec5SDimitry Andric         return true;
1274fe6060f1SDimitry Andric       for (auto MO : MI.implicit_operands())
12750b57cec5SDimitry Andric         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
12760b57cec5SDimitry Andric           return true;
12770b57cec5SDimitry Andric     }
1278fe6060f1SDimitry Andric     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1279fe6060f1SDimitry Andric         (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
12800b57cec5SDimitry Andric       return true;
12810b57cec5SDimitry Andric     return false;
12820b57cec5SDimitry Andric   };
12830b57cec5SDimitry Andric 
12840b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
12850b57cec5SDimitry Andric       std::numeric_limits<int>::max())
12860b57cec5SDimitry Andric     return false;
12870b57cec5SDimitry Andric 
12880b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
12890b57cec5SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
12900b57cec5SDimitry Andric     .addImm(0xfffe);
12910b57cec5SDimitry Andric   return true;
12920b57cec5SDimitry Andric }
12930b57cec5SDimitry Andric 
1294fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1295fe6060f1SDimitry Andric                                                  const GCNSubtarget &ST) {
12960b57cec5SDimitry Andric   if (!ST.hasLdsBranchVmemWARHazard())
12970b57cec5SDimitry Andric     return false;
12980b57cec5SDimitry Andric 
1299fe6060f1SDimitry Andric   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1300fe6060f1SDimitry Andric   // instructions need to appear in the same function.
1301fe6060f1SDimitry Andric   bool HasLds = false;
1302fe6060f1SDimitry Andric   bool HasVmem = false;
1303fe6060f1SDimitry Andric   for (auto &MBB : MF) {
1304fe6060f1SDimitry Andric     for (auto &MI : MBB) {
1305fe6060f1SDimitry Andric       HasLds |= SIInstrInfo::isDS(MI);
1306fe6060f1SDimitry Andric       HasVmem |=
1307fe6060f1SDimitry Andric           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1308fe6060f1SDimitry Andric       if (HasLds && HasVmem)
1309fe6060f1SDimitry Andric         return true;
1310fe6060f1SDimitry Andric     }
1311fe6060f1SDimitry Andric   }
1312fe6060f1SDimitry Andric   return false;
1313fe6060f1SDimitry Andric }
1314fe6060f1SDimitry Andric 
1315fe6060f1SDimitry Andric bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1316fe6060f1SDimitry Andric   if (!RunLdsBranchVmemWARHazardFixup)
1317fe6060f1SDimitry Andric     return false;
1318fe6060f1SDimitry Andric 
1319fe6060f1SDimitry Andric   assert(ST.hasLdsBranchVmemWARHazard());
1320fe6060f1SDimitry Andric 
1321fe6060f1SDimitry Andric   auto IsHazardInst = [](const MachineInstr &MI) {
1322fe6060f1SDimitry Andric     if (SIInstrInfo::isDS(MI))
13230b57cec5SDimitry Andric       return 1;
1324fe6060f1SDimitry Andric     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
13250b57cec5SDimitry Andric       return 2;
13260b57cec5SDimitry Andric     return 0;
13270b57cec5SDimitry Andric   };
13280b57cec5SDimitry Andric 
1329fe6060f1SDimitry Andric   auto InstType = IsHazardInst(*MI);
13300b57cec5SDimitry Andric   if (!InstType)
13310b57cec5SDimitry Andric     return false;
13320b57cec5SDimitry Andric 
1333fe6060f1SDimitry Andric   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1334fe6060f1SDimitry Andric     return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1335fe6060f1SDimitry Andric                                I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1336fe6060f1SDimitry Andric                                !I.getOperand(1).getImm());
13370b57cec5SDimitry Andric   };
13380b57cec5SDimitry Andric 
1339fe6060f1SDimitry Andric   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1340fe6060f1SDimitry Andric     if (!I.isBranch())
13410b57cec5SDimitry Andric       return false;
13420b57cec5SDimitry Andric 
1343fe6060f1SDimitry Andric     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
13440b57cec5SDimitry Andric       auto InstType2 = IsHazardInst(I);
13450b57cec5SDimitry Andric       return InstType2 && InstType != InstType2;
13460b57cec5SDimitry Andric     };
13470b57cec5SDimitry Andric 
1348fe6060f1SDimitry Andric     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
13490b57cec5SDimitry Andric       auto InstType2 = IsHazardInst(I);
13500b57cec5SDimitry Andric       if (InstType == InstType2)
13510b57cec5SDimitry Andric         return true;
13520b57cec5SDimitry Andric 
1353fe6060f1SDimitry Andric       return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1354fe6060f1SDimitry Andric              I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1355fe6060f1SDimitry Andric              !I.getOperand(1).getImm();
13560b57cec5SDimitry Andric     };
13570b57cec5SDimitry Andric 
1358fe6060f1SDimitry Andric     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
13590b57cec5SDimitry Andric            std::numeric_limits<int>::max();
13600b57cec5SDimitry Andric   };
13610b57cec5SDimitry Andric 
13620b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
13630b57cec5SDimitry Andric       std::numeric_limits<int>::max())
13640b57cec5SDimitry Andric     return false;
13650b57cec5SDimitry Andric 
13660b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
13670b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
13680b57cec5SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_VSCNT))
13690b57cec5SDimitry Andric     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
13700b57cec5SDimitry Andric     .addImm(0);
13710b57cec5SDimitry Andric 
13720b57cec5SDimitry Andric   return true;
13730b57cec5SDimitry Andric }
13740b57cec5SDimitry Andric 
1375*81ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1376*81ad6265SDimitry Andric   if (!SIInstrInfo::isLDSDIR(*MI))
1377*81ad6265SDimitry Andric     return false;
1378*81ad6265SDimitry Andric 
1379*81ad6265SDimitry Andric   const int NoHazardWaitStates = 15;
1380*81ad6265SDimitry Andric   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1381*81ad6265SDimitry Andric   const Register VDSTReg = VDST->getReg();
1382*81ad6265SDimitry Andric 
1383*81ad6265SDimitry Andric   bool VisitedTrans = false;
1384*81ad6265SDimitry Andric   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1385*81ad6265SDimitry Andric     if (!SIInstrInfo::isVALU(I))
1386*81ad6265SDimitry Andric       return false;
1387*81ad6265SDimitry Andric     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1388*81ad6265SDimitry Andric     // Cover both WAR and WAW
1389*81ad6265SDimitry Andric     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1390*81ad6265SDimitry Andric   };
1391*81ad6265SDimitry Andric   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1392*81ad6265SDimitry Andric     if (WaitStates >= NoHazardWaitStates)
1393*81ad6265SDimitry Andric       return true;
1394*81ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
1395*81ad6265SDimitry Andric     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1396*81ad6265SDimitry Andric            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1397*81ad6265SDimitry Andric   };
1398*81ad6265SDimitry Andric   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1399*81ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1400*81ad6265SDimitry Andric   };
1401*81ad6265SDimitry Andric 
1402*81ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
1403*81ad6265SDimitry Andric   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1404*81ad6265SDimitry Andric                                     std::next(MI->getReverseIterator()), 0,
1405*81ad6265SDimitry Andric                                     IsExpiredFn, Visited, GetWaitStatesFn);
1406*81ad6265SDimitry Andric 
1407*81ad6265SDimitry Andric   // Transcendentals can execute in parallel to other VALUs.
1408*81ad6265SDimitry Andric   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1409*81ad6265SDimitry Andric   if (VisitedTrans)
1410*81ad6265SDimitry Andric     Count = 0;
1411*81ad6265SDimitry Andric 
1412*81ad6265SDimitry Andric   MachineOperand *WaitVdstOp =
1413*81ad6265SDimitry Andric       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1414*81ad6265SDimitry Andric   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1415*81ad6265SDimitry Andric 
1416*81ad6265SDimitry Andric   return true;
1417*81ad6265SDimitry Andric }
1418*81ad6265SDimitry Andric 
1419*81ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1420*81ad6265SDimitry Andric   if (!SIInstrInfo::isLDSDIR(*MI))
1421*81ad6265SDimitry Andric     return false;
1422*81ad6265SDimitry Andric 
1423*81ad6265SDimitry Andric   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1424*81ad6265SDimitry Andric   const Register VDSTReg = VDST->getReg();
1425*81ad6265SDimitry Andric 
1426*81ad6265SDimitry Andric   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1427*81ad6265SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1428*81ad6265SDimitry Andric         !SIInstrInfo::isDS(I))
1429*81ad6265SDimitry Andric       return false;
1430*81ad6265SDimitry Andric     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1431*81ad6265SDimitry Andric   };
1432*81ad6265SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &I, int) {
1433*81ad6265SDimitry Andric     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1434*81ad6265SDimitry Andric            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1435*81ad6265SDimitry Andric            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1436*81ad6265SDimitry Andric             I.getOperand(0).getImm() == 0xffe3);
1437*81ad6265SDimitry Andric   };
1438*81ad6265SDimitry Andric 
1439*81ad6265SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1440*81ad6265SDimitry Andric       std::numeric_limits<int>::max())
1441*81ad6265SDimitry Andric     return false;
1442*81ad6265SDimitry Andric 
1443*81ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1444*81ad6265SDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1445*81ad6265SDimitry Andric       .addImm(0xffe3);
1446*81ad6265SDimitry Andric 
1447*81ad6265SDimitry Andric   return true;
1448*81ad6265SDimitry Andric }
1449*81ad6265SDimitry Andric 
1450*81ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1451*81ad6265SDimitry Andric   if (!ST.isWave64())
1452*81ad6265SDimitry Andric     return false;
1453*81ad6265SDimitry Andric   if (!ST.hasVALUPartialForwardingHazard())
1454*81ad6265SDimitry Andric     return false;
1455*81ad6265SDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
1456*81ad6265SDimitry Andric     return false;
1457*81ad6265SDimitry Andric 
1458*81ad6265SDimitry Andric   SmallSetVector<Register, 4> SrcVGPRs;
1459*81ad6265SDimitry Andric 
1460*81ad6265SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
1461*81ad6265SDimitry Andric     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1462*81ad6265SDimitry Andric       SrcVGPRs.insert(Use.getReg());
1463*81ad6265SDimitry Andric   }
1464*81ad6265SDimitry Andric 
1465*81ad6265SDimitry Andric   // Only applies with >= 2 unique VGPR sources
1466*81ad6265SDimitry Andric   if (SrcVGPRs.size() <= 1)
1467*81ad6265SDimitry Andric     return false;
1468*81ad6265SDimitry Andric 
1469*81ad6265SDimitry Andric   // Look for the following pattern:
1470*81ad6265SDimitry Andric   //   Va <- VALU [PreExecPos]
1471*81ad6265SDimitry Andric   //   intv1
1472*81ad6265SDimitry Andric   //   Exec <- SALU [ExecPos]
1473*81ad6265SDimitry Andric   //   intv2
1474*81ad6265SDimitry Andric   //   Vb <- VALU [PostExecPos]
1475*81ad6265SDimitry Andric   //   intv3
1476*81ad6265SDimitry Andric   //   MI Va, Vb (WaitState = 0)
1477*81ad6265SDimitry Andric   //
1478*81ad6265SDimitry Andric   // Where:
1479*81ad6265SDimitry Andric   // intv1 + intv2 <= 2 VALUs
1480*81ad6265SDimitry Andric   // intv3 <= 4 VALUs
1481*81ad6265SDimitry Andric   //
1482*81ad6265SDimitry Andric   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1483*81ad6265SDimitry Andric 
1484*81ad6265SDimitry Andric   const int Intv1plus2MaxVALUs = 2;
1485*81ad6265SDimitry Andric   const int Intv3MaxVALUs = 4;
1486*81ad6265SDimitry Andric   const int IntvMaxVALUs = 6;
1487*81ad6265SDimitry Andric   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1488*81ad6265SDimitry Andric 
1489*81ad6265SDimitry Andric   struct StateType {
1490*81ad6265SDimitry Andric     SmallDenseMap<Register, int, 4> DefPos;
1491*81ad6265SDimitry Andric     int ExecPos = std::numeric_limits<int>::max();
1492*81ad6265SDimitry Andric     int VALUs = 0;
1493*81ad6265SDimitry Andric   };
1494*81ad6265SDimitry Andric 
1495*81ad6265SDimitry Andric   StateType State;
1496*81ad6265SDimitry Andric 
1497*81ad6265SDimitry Andric   // This overloads expiry testing with all the hazard detection
1498*81ad6265SDimitry Andric   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1499*81ad6265SDimitry Andric     // Too many VALU states have passed
1500*81ad6265SDimitry Andric     if (State.VALUs > NoHazardVALUWaitStates)
1501*81ad6265SDimitry Andric       return HazardExpired;
1502*81ad6265SDimitry Andric 
1503*81ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
1504*81ad6265SDimitry Andric     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1505*81ad6265SDimitry Andric         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1506*81ad6265SDimitry Andric         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1507*81ad6265SDimitry Andric          I.getOperand(0).getImm() == 0x0fff))
1508*81ad6265SDimitry Andric       return HazardExpired;
1509*81ad6265SDimitry Andric 
1510*81ad6265SDimitry Andric     // Track registers writes
1511*81ad6265SDimitry Andric     bool Changed = false;
1512*81ad6265SDimitry Andric     if (SIInstrInfo::isVALU(I)) {
1513*81ad6265SDimitry Andric       for (Register Src : SrcVGPRs) {
1514*81ad6265SDimitry Andric         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1515*81ad6265SDimitry Andric           State.DefPos[Src] = State.VALUs;
1516*81ad6265SDimitry Andric           Changed = true;
1517*81ad6265SDimitry Andric         }
1518*81ad6265SDimitry Andric       }
1519*81ad6265SDimitry Andric     } else if (SIInstrInfo::isSALU(I)) {
1520*81ad6265SDimitry Andric       if (State.ExecPos == std::numeric_limits<int>::max()) {
1521*81ad6265SDimitry Andric         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1522*81ad6265SDimitry Andric           State.ExecPos = State.VALUs;
1523*81ad6265SDimitry Andric           Changed = true;
1524*81ad6265SDimitry Andric         }
1525*81ad6265SDimitry Andric       }
1526*81ad6265SDimitry Andric     }
1527*81ad6265SDimitry Andric 
1528*81ad6265SDimitry Andric     // Early expiration: too many VALUs in intv3
1529*81ad6265SDimitry Andric     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1530*81ad6265SDimitry Andric       return HazardExpired;
1531*81ad6265SDimitry Andric 
1532*81ad6265SDimitry Andric     // Only evaluate state if something changed
1533*81ad6265SDimitry Andric     if (!Changed)
1534*81ad6265SDimitry Andric       return NoHazardFound;
1535*81ad6265SDimitry Andric 
1536*81ad6265SDimitry Andric     // Determine positions of VALUs pre/post exec change
1537*81ad6265SDimitry Andric     if (State.ExecPos == std::numeric_limits<int>::max())
1538*81ad6265SDimitry Andric       return NoHazardFound;
1539*81ad6265SDimitry Andric 
1540*81ad6265SDimitry Andric     int PreExecPos = std::numeric_limits<int>::max();
1541*81ad6265SDimitry Andric     int PostExecPos = std::numeric_limits<int>::max();
1542*81ad6265SDimitry Andric 
1543*81ad6265SDimitry Andric     for (auto Entry : State.DefPos) {
1544*81ad6265SDimitry Andric       int DefVALUs = Entry.second;
1545*81ad6265SDimitry Andric       if (DefVALUs != std::numeric_limits<int>::max()) {
1546*81ad6265SDimitry Andric         if (DefVALUs >= State.ExecPos)
1547*81ad6265SDimitry Andric           PreExecPos = std::min(PreExecPos, DefVALUs);
1548*81ad6265SDimitry Andric         else if (DefVALUs < State.ExecPos)
1549*81ad6265SDimitry Andric           PostExecPos = std::min(PostExecPos, DefVALUs);
1550*81ad6265SDimitry Andric       }
1551*81ad6265SDimitry Andric     }
1552*81ad6265SDimitry Andric 
1553*81ad6265SDimitry Andric     // Need a VALUs post exec change
1554*81ad6265SDimitry Andric     if (PostExecPos == std::numeric_limits<int>::max())
1555*81ad6265SDimitry Andric       return NoHazardFound;
1556*81ad6265SDimitry Andric 
1557*81ad6265SDimitry Andric     // Too many VALUs in intv3?
1558*81ad6265SDimitry Andric     int Intv3VALUs = PostExecPos;
1559*81ad6265SDimitry Andric     if (Intv3VALUs > Intv3MaxVALUs)
1560*81ad6265SDimitry Andric       return HazardExpired;
1561*81ad6265SDimitry Andric 
1562*81ad6265SDimitry Andric     // Too many VALUs in intv2?
1563*81ad6265SDimitry Andric     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1564*81ad6265SDimitry Andric     if (Intv2VALUs > Intv1plus2MaxVALUs)
1565*81ad6265SDimitry Andric       return HazardExpired;
1566*81ad6265SDimitry Andric 
1567*81ad6265SDimitry Andric     // Need a VALUs pre exec change
1568*81ad6265SDimitry Andric     if (PreExecPos == std::numeric_limits<int>::max())
1569*81ad6265SDimitry Andric       return NoHazardFound;
1570*81ad6265SDimitry Andric 
1571*81ad6265SDimitry Andric     // Too many VALUs in intv1?
1572*81ad6265SDimitry Andric     int Intv1VALUs = PreExecPos - State.ExecPos;
1573*81ad6265SDimitry Andric     if (Intv1VALUs > Intv1plus2MaxVALUs)
1574*81ad6265SDimitry Andric       return HazardExpired;
1575*81ad6265SDimitry Andric 
1576*81ad6265SDimitry Andric     // Too many VALUs in intv1 + intv2
1577*81ad6265SDimitry Andric     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1578*81ad6265SDimitry Andric       return HazardExpired;
1579*81ad6265SDimitry Andric 
1580*81ad6265SDimitry Andric     return HazardFound;
1581*81ad6265SDimitry Andric   };
1582*81ad6265SDimitry Andric   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1583*81ad6265SDimitry Andric     if (SIInstrInfo::isVALU(MI))
1584*81ad6265SDimitry Andric       State.VALUs += 1;
1585*81ad6265SDimitry Andric   };
1586*81ad6265SDimitry Andric 
1587*81ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
1588*81ad6265SDimitry Andric   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1589*81ad6265SDimitry Andric                             std::next(MI->getReverseIterator()), Visited))
1590*81ad6265SDimitry Andric     return false;
1591*81ad6265SDimitry Andric 
1592*81ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1593*81ad6265SDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1594*81ad6265SDimitry Andric       .addImm(0x0fff);
1595*81ad6265SDimitry Andric 
1596*81ad6265SDimitry Andric   return true;
1597*81ad6265SDimitry Andric }
1598*81ad6265SDimitry Andric 
1599*81ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1600*81ad6265SDimitry Andric   if (!ST.hasVALUTransUseHazard())
1601*81ad6265SDimitry Andric     return false;
1602*81ad6265SDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
1603*81ad6265SDimitry Andric     return false;
1604*81ad6265SDimitry Andric 
1605*81ad6265SDimitry Andric   SmallSet<Register, 4> SrcVGPRs;
1606*81ad6265SDimitry Andric 
1607*81ad6265SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
1608*81ad6265SDimitry Andric     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1609*81ad6265SDimitry Andric       SrcVGPRs.insert(Use.getReg());
1610*81ad6265SDimitry Andric   }
1611*81ad6265SDimitry Andric 
1612*81ad6265SDimitry Andric   // Look for the following pattern:
1613*81ad6265SDimitry Andric   //   Va <- TRANS VALU
1614*81ad6265SDimitry Andric   //   intv
1615*81ad6265SDimitry Andric   //   MI Va (WaitState = 0)
1616*81ad6265SDimitry Andric   //
1617*81ad6265SDimitry Andric   // Where:
1618*81ad6265SDimitry Andric   // intv <= 5 VALUs / 1 TRANS
1619*81ad6265SDimitry Andric   //
1620*81ad6265SDimitry Andric   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1621*81ad6265SDimitry Andric 
1622*81ad6265SDimitry Andric   const int IntvMaxVALUs = 5;
1623*81ad6265SDimitry Andric   const int IntvMaxTRANS = 1;
1624*81ad6265SDimitry Andric 
1625*81ad6265SDimitry Andric   struct StateType {
1626*81ad6265SDimitry Andric     int VALUs = 0;
1627*81ad6265SDimitry Andric     int TRANS = 0;
1628*81ad6265SDimitry Andric   };
1629*81ad6265SDimitry Andric 
1630*81ad6265SDimitry Andric   StateType State;
1631*81ad6265SDimitry Andric 
1632*81ad6265SDimitry Andric   // This overloads expiry testing with all the hazard detection
1633*81ad6265SDimitry Andric   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1634*81ad6265SDimitry Andric     // Too many VALU states have passed
1635*81ad6265SDimitry Andric     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1636*81ad6265SDimitry Andric       return HazardExpired;
1637*81ad6265SDimitry Andric 
1638*81ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
1639*81ad6265SDimitry Andric     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1640*81ad6265SDimitry Andric         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1641*81ad6265SDimitry Andric         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1642*81ad6265SDimitry Andric          I.getOperand(0).getImm() == 0x0fff))
1643*81ad6265SDimitry Andric       return HazardExpired;
1644*81ad6265SDimitry Andric 
1645*81ad6265SDimitry Andric     // Track registers writes
1646*81ad6265SDimitry Andric     if (SIInstrInfo::isTRANS(I)) {
1647*81ad6265SDimitry Andric       for (Register Src : SrcVGPRs) {
1648*81ad6265SDimitry Andric         if (I.modifiesRegister(Src, &TRI)) {
1649*81ad6265SDimitry Andric           return HazardFound;
1650*81ad6265SDimitry Andric         }
1651*81ad6265SDimitry Andric       }
1652*81ad6265SDimitry Andric     }
1653*81ad6265SDimitry Andric 
1654*81ad6265SDimitry Andric     return NoHazardFound;
1655*81ad6265SDimitry Andric   };
1656*81ad6265SDimitry Andric   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1657*81ad6265SDimitry Andric     if (SIInstrInfo::isVALU(MI))
1658*81ad6265SDimitry Andric       State.VALUs += 1;
1659*81ad6265SDimitry Andric     if (SIInstrInfo::isTRANS(MI))
1660*81ad6265SDimitry Andric       State.TRANS += 1;
1661*81ad6265SDimitry Andric   };
1662*81ad6265SDimitry Andric 
1663*81ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
1664*81ad6265SDimitry Andric   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1665*81ad6265SDimitry Andric                             std::next(MI->getReverseIterator()), Visited))
1666*81ad6265SDimitry Andric     return false;
1667*81ad6265SDimitry Andric 
1668*81ad6265SDimitry Andric   // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1669*81ad6265SDimitry Andric   // avoided (mask 0x0fff achieves this).
1670*81ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1671*81ad6265SDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1672*81ad6265SDimitry Andric       .addImm(0x0fff);
1673*81ad6265SDimitry Andric 
1674*81ad6265SDimitry Andric   return true;
1675*81ad6265SDimitry Andric }
1676*81ad6265SDimitry Andric 
1677*81ad6265SDimitry Andric bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1678*81ad6265SDimitry Andric   if (!SIInstrInfo::isWMMA(*MI))
1679*81ad6265SDimitry Andric     return false;
1680*81ad6265SDimitry Andric 
1681*81ad6265SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1682*81ad6265SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1683*81ad6265SDimitry Andric 
1684*81ad6265SDimitry Andric   auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1685*81ad6265SDimitry Andric     if (!SIInstrInfo::isWMMA(I))
1686*81ad6265SDimitry Andric       return false;
1687*81ad6265SDimitry Andric 
1688*81ad6265SDimitry Andric     // Src0 or Src1 of the current wmma instruction overlaps with the dest of
1689*81ad6265SDimitry Andric     // the previous wmma.
1690*81ad6265SDimitry Andric     const Register CurSrc0Reg =
1691*81ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1692*81ad6265SDimitry Andric     const Register CurSrc1Reg =
1693*81ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1694*81ad6265SDimitry Andric 
1695*81ad6265SDimitry Andric     const Register PrevDstReg =
1696*81ad6265SDimitry Andric         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1697*81ad6265SDimitry Andric 
1698*81ad6265SDimitry Andric     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1699*81ad6265SDimitry Andric         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1700*81ad6265SDimitry Andric       return true;
1701*81ad6265SDimitry Andric     }
1702*81ad6265SDimitry Andric 
1703*81ad6265SDimitry Andric     // Src2 of the current wmma instruction overlaps with the dest of the
1704*81ad6265SDimitry Andric     // previous wmma.
1705*81ad6265SDimitry Andric     const MachineOperand *Src2 =
1706*81ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
1707*81ad6265SDimitry Andric     const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();
1708*81ad6265SDimitry Andric 
1709*81ad6265SDimitry Andric     if (CurSrc2Reg != AMDGPU::NoRegister &&
1710*81ad6265SDimitry Andric         TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {
1711*81ad6265SDimitry Andric 
1712*81ad6265SDimitry Andric       const MachineOperand *Src2Mods =
1713*81ad6265SDimitry Andric           TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
1714*81ad6265SDimitry Andric       const bool NoSrc2Mods =
1715*81ad6265SDimitry Andric           (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
1716*81ad6265SDimitry Andric       // Exception: there is no hazard if the wmma instructions are of the same
1717*81ad6265SDimitry Andric       // type and there is no input modifier on src2 of the current instruction.
1718*81ad6265SDimitry Andric       return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
1719*81ad6265SDimitry Andric                               TII->pseudoToMCOpcode(MI->getOpcode())));
1720*81ad6265SDimitry Andric     }
1721*81ad6265SDimitry Andric 
1722*81ad6265SDimitry Andric     return false;
1723*81ad6265SDimitry Andric   };
1724*81ad6265SDimitry Andric 
1725*81ad6265SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &I, int) {
1726*81ad6265SDimitry Andric     return SIInstrInfo::isVALU(I);
1727*81ad6265SDimitry Andric   };
1728*81ad6265SDimitry Andric 
1729*81ad6265SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1730*81ad6265SDimitry Andric       std::numeric_limits<int>::max())
1731*81ad6265SDimitry Andric     return false;
1732*81ad6265SDimitry Andric 
1733*81ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1734*81ad6265SDimitry Andric 
1735*81ad6265SDimitry Andric   return true;
1736*81ad6265SDimitry Andric }
1737*81ad6265SDimitry Andric 
17380b57cec5SDimitry Andric int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
17390b57cec5SDimitry Andric   int NSAtoVMEMWaitStates = 1;
17400b57cec5SDimitry Andric 
17410b57cec5SDimitry Andric   if (!ST.hasNSAtoVMEMBug())
17420b57cec5SDimitry Andric     return 0;
17430b57cec5SDimitry Andric 
17440b57cec5SDimitry Andric   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
17450b57cec5SDimitry Andric     return 0;
17460b57cec5SDimitry Andric 
17470b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
17480b57cec5SDimitry Andric   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
17490b57cec5SDimitry Andric   if (!Offset || (Offset->getImm() & 6) == 0)
17500b57cec5SDimitry Andric     return 0;
17510b57cec5SDimitry Andric 
1752fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &I) {
1753fe6060f1SDimitry Andric     if (!SIInstrInfo::isMIMG(I))
17540b57cec5SDimitry Andric       return false;
1755fe6060f1SDimitry Andric     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
17560b57cec5SDimitry Andric     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1757fe6060f1SDimitry Andric            TII->getInstSizeInBytes(I) >= 16;
17580b57cec5SDimitry Andric   };
17590b57cec5SDimitry Andric 
17600b57cec5SDimitry Andric   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
17610b57cec5SDimitry Andric }
17620b57cec5SDimitry Andric 
17630b57cec5SDimitry Andric int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
17640b57cec5SDimitry Andric   int FPAtomicToDenormModeWaitStates = 3;
17650b57cec5SDimitry Andric 
17660b57cec5SDimitry Andric   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
17670b57cec5SDimitry Andric     return 0;
17680b57cec5SDimitry Andric 
1769fe6060f1SDimitry Andric   auto IsHazardFn = [](const MachineInstr &I) {
1770fe6060f1SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
17710b57cec5SDimitry Andric       return false;
1772fe6060f1SDimitry Andric     return SIInstrInfo::isFPAtomic(I);
17730b57cec5SDimitry Andric   };
17740b57cec5SDimitry Andric 
1775fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1776fe6060f1SDimitry Andric     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
17770b57cec5SDimitry Andric       return true;
17780b57cec5SDimitry Andric 
1779fe6060f1SDimitry Andric     switch (MI.getOpcode()) {
17800b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT:
17810b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_VSCNT:
17820b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_VMCNT:
17830b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_EXPCNT:
17840b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_LGKMCNT:
1785e8d8bef9SDimitry Andric     case AMDGPU::S_WAIT_IDLE:
17860b57cec5SDimitry Andric       return true;
17870b57cec5SDimitry Andric     default:
17880b57cec5SDimitry Andric       break;
17890b57cec5SDimitry Andric     }
17900b57cec5SDimitry Andric 
17910b57cec5SDimitry Andric     return false;
17920b57cec5SDimitry Andric   };
17930b57cec5SDimitry Andric 
17940b57cec5SDimitry Andric   return FPAtomicToDenormModeWaitStates -
17950b57cec5SDimitry Andric          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
17960b57cec5SDimitry Andric }
17970b57cec5SDimitry Andric 
17980b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
17990b57cec5SDimitry Andric   assert(SIInstrInfo::isMAI(*MI));
18000b57cec5SDimitry Andric 
1801fe6060f1SDimitry Andric   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1802fe6060f1SDimitry Andric }
1803fe6060f1SDimitry Andric 
1804*81ad6265SDimitry Andric int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1805*81ad6265SDimitry Andric   // Early exit if no padding is requested.
1806*81ad6265SDimitry Andric   if (MFMAPaddingRatio == 0)
1807*81ad6265SDimitry Andric     return 0;
1808*81ad6265SDimitry Andric 
1809*81ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1810*81ad6265SDimitry Andric   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1811*81ad6265SDimitry Andric     return 0;
1812*81ad6265SDimitry Andric 
1813*81ad6265SDimitry Andric   int NeighborMFMALatency = 0;
1814*81ad6265SDimitry Andric   auto IsNeighboringMFMA = [&NeighborMFMALatency,
1815*81ad6265SDimitry Andric                             this](const MachineInstr &MI) {
1816*81ad6265SDimitry Andric     if (!SIInstrInfo::isMFMA(MI))
1817*81ad6265SDimitry Andric       return false;
1818*81ad6265SDimitry Andric 
1819*81ad6265SDimitry Andric     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1820*81ad6265SDimitry Andric     return true;
1821*81ad6265SDimitry Andric   };
1822*81ad6265SDimitry Andric 
1823*81ad6265SDimitry Andric   const int MaxMFMAPipelineWaitStates = 16;
1824*81ad6265SDimitry Andric   int WaitStatesSinceNeighborMFMA =
1825*81ad6265SDimitry Andric       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1826*81ad6265SDimitry Andric 
1827*81ad6265SDimitry Andric   int NeighborMFMAPaddingNeeded =
1828*81ad6265SDimitry Andric       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1829*81ad6265SDimitry Andric       WaitStatesSinceNeighborMFMA;
1830*81ad6265SDimitry Andric 
1831*81ad6265SDimitry Andric   return std::max(0, NeighborMFMAPaddingNeeded);
1832*81ad6265SDimitry Andric }
1833*81ad6265SDimitry Andric 
1834fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
18350b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
18360b57cec5SDimitry Andric   unsigned Opc = MI->getOpcode();
18370b57cec5SDimitry Andric 
1838fe6060f1SDimitry Andric   auto IsVALUFn = [](const MachineInstr &MI) {
1839fe6060f1SDimitry Andric     return SIInstrInfo::isVALU(MI);
18400b57cec5SDimitry Andric   };
18410b57cec5SDimitry Andric 
1842e8d8bef9SDimitry Andric   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
18430b57cec5SDimitry Andric     const int LegacyVALUWritesVGPRWaitStates = 2;
18440b57cec5SDimitry Andric     const int VALUWritesExecWaitStates = 4;
18450b57cec5SDimitry Andric     const int MaxWaitStates = 4;
18460b57cec5SDimitry Andric 
18470b57cec5SDimitry Andric     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
18480b57cec5SDimitry Andric       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
18490b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
18500b57cec5SDimitry Andric 
18510b57cec5SDimitry Andric     if (WaitStatesNeeded < MaxWaitStates) {
18520b57cec5SDimitry Andric       for (const MachineOperand &Use : MI->explicit_uses()) {
18530b57cec5SDimitry Andric         const int MaxWaitStates = 2;
18540b57cec5SDimitry Andric 
18550b57cec5SDimitry Andric         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
18560b57cec5SDimitry Andric           continue;
18570b57cec5SDimitry Andric 
18580b57cec5SDimitry Andric         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
18590b57cec5SDimitry Andric           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
18600b57cec5SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
18610b57cec5SDimitry Andric 
18620b57cec5SDimitry Andric         if (WaitStatesNeeded == MaxWaitStates)
18630b57cec5SDimitry Andric           break;
18640b57cec5SDimitry Andric       }
18650b57cec5SDimitry Andric     }
18660b57cec5SDimitry Andric   }
18670b57cec5SDimitry Andric 
18680b57cec5SDimitry Andric   for (const MachineOperand &Op : MI->explicit_operands()) {
18690b57cec5SDimitry Andric     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
18700b57cec5SDimitry Andric       continue;
18710b57cec5SDimitry Andric 
1872e8d8bef9SDimitry Andric     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
18730b57cec5SDimitry Andric       continue;
18740b57cec5SDimitry Andric 
18750b57cec5SDimitry Andric     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
18760b57cec5SDimitry Andric     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
18770b57cec5SDimitry Andric     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
18780b57cec5SDimitry Andric     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
18790b57cec5SDimitry Andric     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
18800b57cec5SDimitry Andric     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
18810b57cec5SDimitry Andric     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
18820b57cec5SDimitry Andric     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
18830b57cec5SDimitry Andric     const int MaxWaitStates = 18;
18848bcb0991SDimitry Andric     Register Reg = Op.getReg();
18850b57cec5SDimitry Andric     unsigned HazardDefLatency = 0;
18860b57cec5SDimitry Andric 
1887*81ad6265SDimitry Andric     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
1888fe6060f1SDimitry Andric                                this](const MachineInstr &MI) {
1889*81ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
18900b57cec5SDimitry Andric         return false;
1891fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
18920b57cec5SDimitry Andric       if (DstReg == Reg)
18930b57cec5SDimitry Andric         return false;
1894fe6060f1SDimitry Andric       HazardDefLatency =
1895fe6060f1SDimitry Andric           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
18960b57cec5SDimitry Andric       return TRI.regsOverlap(DstReg, Reg);
18970b57cec5SDimitry Andric     };
18980b57cec5SDimitry Andric 
18990b57cec5SDimitry Andric     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
19000b57cec5SDimitry Andric                                                    MaxWaitStates);
19010b57cec5SDimitry Andric     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
19020b57cec5SDimitry Andric     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
19030b57cec5SDimitry Andric     int OpNo = MI->getOperandNo(&Op);
19040b57cec5SDimitry Andric     if (OpNo == SrcCIdx) {
19050b57cec5SDimitry Andric       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1906e8d8bef9SDimitry Andric     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
19070b57cec5SDimitry Andric       switch (HazardDefLatency) {
19080b57cec5SDimitry Andric       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
19090b57cec5SDimitry Andric                break;
19100b57cec5SDimitry Andric       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
19110b57cec5SDimitry Andric                break;
19120b57cec5SDimitry Andric       case 16: LLVM_FALLTHROUGH;
19130b57cec5SDimitry Andric       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
19140b57cec5SDimitry Andric                break;
19150b57cec5SDimitry Andric       }
1916e8d8bef9SDimitry Andric     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
19170b57cec5SDimitry Andric       switch (HazardDefLatency) {
19180b57cec5SDimitry Andric       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
19190b57cec5SDimitry Andric                break;
19200b57cec5SDimitry Andric       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
19210b57cec5SDimitry Andric                break;
19220b57cec5SDimitry Andric       case 16: LLVM_FALLTHROUGH;
19230b57cec5SDimitry Andric       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
19240b57cec5SDimitry Andric                break;
19250b57cec5SDimitry Andric       }
19260b57cec5SDimitry Andric     }
19270b57cec5SDimitry Andric 
19280b57cec5SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
19290b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
19300b57cec5SDimitry Andric 
19310b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
19320b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
19330b57cec5SDimitry Andric 
1934fe6060f1SDimitry Andric     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
1935fe6060f1SDimitry Andric       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
19360b57cec5SDimitry Andric         return false;
1937fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
19380b57cec5SDimitry Andric       return TRI.regsOverlap(Reg, DstReg);
19390b57cec5SDimitry Andric     };
19400b57cec5SDimitry Andric 
19410b57cec5SDimitry Andric     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
19420b57cec5SDimitry Andric     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
19430b57cec5SDimitry Andric     const int AccVGPRWriteAccVgprReadWaitStates = 3;
19440b57cec5SDimitry Andric     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
19450b57cec5SDimitry Andric     if (OpNo == SrcCIdx)
19460b57cec5SDimitry Andric       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1947e8d8bef9SDimitry Andric     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
19480b57cec5SDimitry Andric       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
19490b57cec5SDimitry Andric 
19500b57cec5SDimitry Andric     WaitStatesNeededForUse = NeedWaitStates -
19510b57cec5SDimitry Andric       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
19520b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
19530b57cec5SDimitry Andric 
19540b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
19550b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
19560b57cec5SDimitry Andric   }
19570b57cec5SDimitry Andric 
1958e8d8bef9SDimitry Andric   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
19590b57cec5SDimitry Andric     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
19600b57cec5SDimitry Andric     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
19610b57cec5SDimitry Andric     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
19620b57cec5SDimitry Andric     const int MaxWaitStates = 13;
19638bcb0991SDimitry Andric     Register DstReg = MI->getOperand(0).getReg();
19640b57cec5SDimitry Andric     unsigned HazardDefLatency = 0;
19650b57cec5SDimitry Andric 
1966*81ad6265SDimitry Andric     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
1967fe6060f1SDimitry Andric                          this](const MachineInstr &MI) {
1968*81ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
19690b57cec5SDimitry Andric         return false;
1970fe6060f1SDimitry Andric       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
1971fe6060f1SDimitry Andric       HazardDefLatency =
1972fe6060f1SDimitry Andric           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
19730b57cec5SDimitry Andric       return TRI.regsOverlap(Reg, DstReg);
19740b57cec5SDimitry Andric     };
19750b57cec5SDimitry Andric 
19760b57cec5SDimitry Andric     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
19770b57cec5SDimitry Andric     int NeedWaitStates;
19780b57cec5SDimitry Andric     switch (HazardDefLatency) {
19790b57cec5SDimitry Andric     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
19800b57cec5SDimitry Andric              break;
19810b57cec5SDimitry Andric     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
19820b57cec5SDimitry Andric              break;
19830b57cec5SDimitry Andric     case 16: LLVM_FALLTHROUGH;
19840b57cec5SDimitry Andric     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
19850b57cec5SDimitry Andric              break;
19860b57cec5SDimitry Andric     }
19870b57cec5SDimitry Andric 
19880b57cec5SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
19890b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
19900b57cec5SDimitry Andric   }
19910b57cec5SDimitry Andric 
1992*81ad6265SDimitry Andric   // Pad neighboring MFMA with noops for better inter-wave performance.
1993*81ad6265SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
1994*81ad6265SDimitry Andric 
19950b57cec5SDimitry Andric   return WaitStatesNeeded;
19960b57cec5SDimitry Andric }
19970b57cec5SDimitry Andric 
1998fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
1999fe6060f1SDimitry Andric   int WaitStatesNeeded = 0;
2000fe6060f1SDimitry Andric   unsigned Opc = MI->getOpcode();
2001fe6060f1SDimitry Andric 
2002*81ad6265SDimitry Andric   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2003*81ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2004fe6060f1SDimitry Andric   };
2005fe6060f1SDimitry Andric 
2006*81ad6265SDimitry Andric   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2007*81ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2008*81ad6265SDimitry Andric            !SIInstrInfo::isDOT(MI);
2009fe6060f1SDimitry Andric   };
2010fe6060f1SDimitry Andric 
2011*81ad6265SDimitry Andric   if (!SIInstrInfo::isMFMA(*MI))
2012fe6060f1SDimitry Andric     return WaitStatesNeeded;
2013fe6060f1SDimitry Andric 
2014fe6060f1SDimitry Andric   const int VALUWritesExecWaitStates = 4;
2015fe6060f1SDimitry Andric   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2016fe6060f1SDimitry Andric     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2017fe6060f1SDimitry Andric                           VALUWritesExecWaitStates);
2018fe6060f1SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2019fe6060f1SDimitry Andric 
2020fe6060f1SDimitry Andric   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2021fe6060f1SDimitry Andric 
2022fe6060f1SDimitry Andric   // Loop for both DGEMM and S/HGEMM 2nd instruction.
2023fe6060f1SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
2024fe6060f1SDimitry Andric     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2025fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2026*81ad6265SDimitry Andric     const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
2027*81ad6265SDimitry Andric     const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
2028*81ad6265SDimitry Andric     const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
2029*81ad6265SDimitry Andric     const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
2030*81ad6265SDimitry Andric     const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
2031*81ad6265SDimitry Andric     const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
2032*81ad6265SDimitry Andric     const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
2033fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2034fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2035fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2036fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2037fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2038fe6060f1SDimitry Andric     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2039fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2040fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2041fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2042fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2043*81ad6265SDimitry Andric     const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
2044*81ad6265SDimitry Andric     const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
2045*81ad6265SDimitry Andric     const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
2046*81ad6265SDimitry Andric     const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
2047*81ad6265SDimitry Andric     const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
2048*81ad6265SDimitry Andric     const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
2049*81ad6265SDimitry Andric     const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
2050*81ad6265SDimitry Andric     const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
2051fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2052fe6060f1SDimitry Andric     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2053fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2054*81ad6265SDimitry Andric     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2055fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2056fe6060f1SDimitry Andric 
2057fe6060f1SDimitry Andric     if (!Use.isReg())
2058fe6060f1SDimitry Andric       continue;
205904eeddc0SDimitry Andric     Register Reg = Use.getReg();
2060fe6060f1SDimitry Andric     bool FullReg;
2061fe6060f1SDimitry Andric     const MachineInstr *MI1;
2062fe6060f1SDimitry Andric 
2063*81ad6265SDimitry Andric     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2064fe6060f1SDimitry Andric                                this](const MachineInstr &MI) {
2065*81ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
2066fe6060f1SDimitry Andric         return false;
2067fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
2068fe6060f1SDimitry Andric       FullReg = (DstReg == Reg);
2069fe6060f1SDimitry Andric       MI1 = &MI;
2070fe6060f1SDimitry Andric       return TRI.regsOverlap(DstReg, Reg);
2071fe6060f1SDimitry Andric     };
2072fe6060f1SDimitry Andric 
2073fe6060f1SDimitry Andric     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2074fe6060f1SDimitry Andric       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2075fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2076fe6060f1SDimitry Andric 
20774824e7fdSDimitry Andric     int NumWaitStates =
20784824e7fdSDimitry Andric         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2079fe6060f1SDimitry Andric     if (NumWaitStates == std::numeric_limits<int>::max())
2080fe6060f1SDimitry Andric       continue;
2081fe6060f1SDimitry Andric 
2082fe6060f1SDimitry Andric     int OpNo = MI->getOperandNo(&Use);
2083fe6060f1SDimitry Andric     unsigned Opc1 = MI1->getOpcode();
2084fe6060f1SDimitry Andric     int NeedWaitStates = 0;
2085fe6060f1SDimitry Andric     if (OpNo == SrcCIdx) {
2086*81ad6265SDimitry Andric       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2087fe6060f1SDimitry Andric         NeedWaitStates = 0;
2088fe6060f1SDimitry Andric       } else if (FullReg) {
2089fe6060f1SDimitry Andric         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2090fe6060f1SDimitry Andric              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2091fe6060f1SDimitry Andric             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2092fe6060f1SDimitry Andric              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2093fe6060f1SDimitry Andric           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2094*81ad6265SDimitry Andric         else if (ST.hasGFX940Insts() &&
2095*81ad6265SDimitry Andric                  TSchedModel.computeInstrLatency(MI1) == 2)
2096*81ad6265SDimitry Andric           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2097fe6060f1SDimitry Andric       } else {
2098fe6060f1SDimitry Andric         switch (Opc1) {
2099fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2100fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
210104eeddc0SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
210204eeddc0SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2103fe6060f1SDimitry Andric           if (!isXDL(ST, *MI))
2104fe6060f1SDimitry Andric             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2105fe6060f1SDimitry Andric           break;
2106fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2107fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2108fe6060f1SDimitry Andric           if (!isXDL(ST, *MI))
2109fe6060f1SDimitry Andric             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2110fe6060f1SDimitry Andric           break;
2111fe6060f1SDimitry Andric         default:
2112*81ad6265SDimitry Andric           if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
2113*81ad6265SDimitry Andric             break;
2114fe6060f1SDimitry Andric           switch (TSchedModel.computeInstrLatency(MI1)) {
2115fe6060f1SDimitry Andric           case 2:
2116*81ad6265SDimitry Andric             NeedWaitStates = ST.hasGFX940Insts()
2117*81ad6265SDimitry Andric               ? isXDL(ST, *MI1)
2118*81ad6265SDimitry Andric                 ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
2119*81ad6265SDimitry Andric                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
2120*81ad6265SDimitry Andric               : isDGEMM(Opc)
2121fe6060f1SDimitry Andric                 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2122fe6060f1SDimitry Andric                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2123fe6060f1SDimitry Andric             break;
2124*81ad6265SDimitry Andric           case 4:
2125*81ad6265SDimitry Andric             assert(ST.hasGFX940Insts());
2126*81ad6265SDimitry Andric             NeedWaitStates = isXDL(ST, *MI1)
2127*81ad6265SDimitry Andric               ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
2128*81ad6265SDimitry Andric               : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
2129*81ad6265SDimitry Andric             break;
2130fe6060f1SDimitry Andric           case 8:
2131*81ad6265SDimitry Andric             NeedWaitStates = ST.hasGFX940Insts()
2132*81ad6265SDimitry Andric               ? isXDL(ST, *MI1)
2133*81ad6265SDimitry Andric                 ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
2134*81ad6265SDimitry Andric                 : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
2135*81ad6265SDimitry Andric               : isDGEMM(Opc)
2136fe6060f1SDimitry Andric                 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2137fe6060f1SDimitry Andric                 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2138fe6060f1SDimitry Andric             break;
2139fe6060f1SDimitry Andric           case 16: LLVM_FALLTHROUGH;
2140fe6060f1SDimitry Andric           default:
2141*81ad6265SDimitry Andric             NeedWaitStates = ST.hasGFX940Insts()
2142*81ad6265SDimitry Andric               ? isXDL(ST, *MI1)
2143*81ad6265SDimitry Andric                 ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
2144*81ad6265SDimitry Andric                 : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
2145*81ad6265SDimitry Andric               : isDGEMM(Opc)
2146fe6060f1SDimitry Andric                 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2147fe6060f1SDimitry Andric                 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2148fe6060f1SDimitry Andric           }
2149fe6060f1SDimitry Andric         }
2150fe6060f1SDimitry Andric       }
2151fe6060f1SDimitry Andric     } else {
2152fe6060f1SDimitry Andric       switch (Opc1) {
2153fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2154fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
215504eeddc0SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
215604eeddc0SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2157fe6060f1SDimitry Andric         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2158fe6060f1SDimitry Andric         break;
2159fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2160fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2161fe6060f1SDimitry Andric         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2162fe6060f1SDimitry Andric         break;
2163fe6060f1SDimitry Andric       default:
2164fe6060f1SDimitry Andric         switch (TSchedModel.computeInstrLatency(MI1)) {
2165fe6060f1SDimitry Andric         case 2:
2166*81ad6265SDimitry Andric           NeedWaitStates = ST.hasGFX940Insts()
2167*81ad6265SDimitry Andric             ? isXDL(ST, *MI1)
2168*81ad6265SDimitry Andric               ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
2169*81ad6265SDimitry Andric               : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
2170*81ad6265SDimitry Andric             : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2171*81ad6265SDimitry Andric           break;
2172*81ad6265SDimitry Andric         case 4:
2173*81ad6265SDimitry Andric           assert(ST.hasGFX940Insts());
2174*81ad6265SDimitry Andric           NeedWaitStates = isXDL(ST, *MI1)
2175*81ad6265SDimitry Andric             ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
2176*81ad6265SDimitry Andric             : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2177fe6060f1SDimitry Andric           break;
2178fe6060f1SDimitry Andric         case 8:
2179*81ad6265SDimitry Andric           NeedWaitStates = ST.hasGFX940Insts()
2180*81ad6265SDimitry Andric             ? isXDL(ST, *MI1)
2181*81ad6265SDimitry Andric               ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
2182*81ad6265SDimitry Andric               : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
2183*81ad6265SDimitry Andric             : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2184fe6060f1SDimitry Andric           break;
2185fe6060f1SDimitry Andric         case 16: LLVM_FALLTHROUGH;
2186fe6060f1SDimitry Andric         default:
2187*81ad6265SDimitry Andric           NeedWaitStates = ST.hasGFX940Insts()
2188*81ad6265SDimitry Andric             ? isXDL(ST, *MI1)
2189*81ad6265SDimitry Andric               ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
2190*81ad6265SDimitry Andric               : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
2191*81ad6265SDimitry Andric             : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2192fe6060f1SDimitry Andric         }
2193fe6060f1SDimitry Andric       }
2194fe6060f1SDimitry Andric     }
2195fe6060f1SDimitry Andric     if (WaitStatesNeeded >= NeedWaitStates)
2196fe6060f1SDimitry Andric       continue;
2197fe6060f1SDimitry Andric 
2198fe6060f1SDimitry Andric     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2199fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2200fe6060f1SDimitry Andric 
2201fe6060f1SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
2202fe6060f1SDimitry Andric       break;
2203fe6060f1SDimitry Andric   }
2204fe6060f1SDimitry Andric 
2205fe6060f1SDimitry Andric   return WaitStatesNeeded;
2206fe6060f1SDimitry Andric }
2207fe6060f1SDimitry Andric 
22080b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2209349cc55cSDimitry Andric   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2210fe6060f1SDimitry Andric   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
22110b57cec5SDimitry Andric     return 0;
22120b57cec5SDimitry Andric 
22130b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
22140b57cec5SDimitry Andric 
2215fe6060f1SDimitry Andric   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2216fe6060f1SDimitry Andric     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
22170b57cec5SDimitry Andric   };
22180b57cec5SDimitry Andric 
22190b57cec5SDimitry Andric   for (const MachineOperand &Op : MI->explicit_uses()) {
22200b57cec5SDimitry Andric     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
22210b57cec5SDimitry Andric       continue;
22220b57cec5SDimitry Andric 
22238bcb0991SDimitry Andric     Register Reg = Op.getReg();
22240b57cec5SDimitry Andric 
22250b57cec5SDimitry Andric     const int AccVgprReadLdStWaitStates = 2;
2226e8d8bef9SDimitry Andric     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
22270b57cec5SDimitry Andric     const int MaxWaitStates = 2;
22280b57cec5SDimitry Andric 
22290b57cec5SDimitry Andric     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
22300b57cec5SDimitry Andric       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
22310b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
22320b57cec5SDimitry Andric 
22330b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
22340b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
22350b57cec5SDimitry Andric 
2236fe6060f1SDimitry Andric     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2237fe6060f1SDimitry Andric       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2238fe6060f1SDimitry Andric           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
22390b57cec5SDimitry Andric         return false;
2240fe6060f1SDimitry Andric       auto IsVALUFn = [](const MachineInstr &MI) {
2241fe6060f1SDimitry Andric         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
22420b57cec5SDimitry Andric       };
22430b57cec5SDimitry Andric       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
22440b57cec5SDimitry Andric              std::numeric_limits<int>::max();
22450b57cec5SDimitry Andric     };
22460b57cec5SDimitry Andric 
2247e8d8bef9SDimitry Andric     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2248e8d8bef9SDimitry Andric       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
22490b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
22500b57cec5SDimitry Andric   }
22510b57cec5SDimitry Andric 
22520b57cec5SDimitry Andric   return WaitStatesNeeded;
22530b57cec5SDimitry Andric }
2254e8d8bef9SDimitry Andric 
2255fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2256fe6060f1SDimitry Andric   if (!ST.hasGFX90AInsts())
2257fe6060f1SDimitry Andric     return 0;
2258fe6060f1SDimitry Andric 
2259fe6060f1SDimitry Andric   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2260fe6060f1SDimitry Andric     return isDGEMM(MI.getOpcode());
2261fe6060f1SDimitry Andric   };
2262fe6060f1SDimitry Andric 
2263fe6060f1SDimitry Andric   // This is checked in checkMAIHazards90A()
2264*81ad6265SDimitry Andric   if (SIInstrInfo::isMFMA(*MI))
2265fe6060f1SDimitry Andric     return 0;
2266fe6060f1SDimitry Andric 
2267fe6060f1SDimitry Andric   int WaitStatesNeeded = 0;
2268fe6060f1SDimitry Andric 
2269fe6060f1SDimitry Andric   bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
2270fe6060f1SDimitry Andric                        SIInstrInfo::isFLAT(*MI) ||
2271fe6060f1SDimitry Andric                        SIInstrInfo::isDS(*MI) ||
2272fe6060f1SDimitry Andric                        SIInstrInfo::isEXP(*MI);
2273fe6060f1SDimitry Andric   bool IsVALU = SIInstrInfo::isVALU(*MI);
2274fe6060f1SDimitry Andric 
2275fe6060f1SDimitry Andric   const MachineInstr *MFMA = nullptr;
2276fe6060f1SDimitry Andric   unsigned Reg;
2277*81ad6265SDimitry Andric   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2278*81ad6265SDimitry Andric     if (!SIInstrInfo::isMFMA(MI) ||
2279*81ad6265SDimitry Andric         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2280fe6060f1SDimitry Andric       return false;
2281fe6060f1SDimitry Andric     MFMA = &MI;
2282fe6060f1SDimitry Andric     return true;
2283fe6060f1SDimitry Andric   };
2284fe6060f1SDimitry Andric 
2285fe6060f1SDimitry Andric   const MachineInstr *DOT = nullptr;
2286fe6060f1SDimitry Andric   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2287fe6060f1SDimitry Andric     if (!SIInstrInfo::isDOT(MI) ||
2288fe6060f1SDimitry Andric         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2289fe6060f1SDimitry Andric       return false;
2290fe6060f1SDimitry Andric     DOT = &MI;
2291fe6060f1SDimitry Andric     return true;
2292fe6060f1SDimitry Andric   };
2293fe6060f1SDimitry Andric 
2294fe6060f1SDimitry Andric   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2295fe6060f1SDimitry Andric                                            AMDGPU::OpName::src2);
2296fe6060f1SDimitry Andric 
2297fe6060f1SDimitry Andric   if (IsMemOrExport || IsVALU) {
2298fe6060f1SDimitry Andric     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2299fe6060f1SDimitry Andric     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2300fe6060f1SDimitry Andric     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2301*81ad6265SDimitry Andric     const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
2302*81ad6265SDimitry Andric     const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
2303*81ad6265SDimitry Andric     const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
2304*81ad6265SDimitry Andric     const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
2305*81ad6265SDimitry Andric     const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
2306*81ad6265SDimitry Andric     const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
2307*81ad6265SDimitry Andric     const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
2308*81ad6265SDimitry Andric     const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2309fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2310fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2311fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2312fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2313fe6060f1SDimitry Andric     const int DotWriteSameDotReadSrcAB = 3;
2314fe6060f1SDimitry Andric     const int DotWriteDifferentVALURead = 3;
2315fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2316fe6060f1SDimitry Andric 
2317fe6060f1SDimitry Andric     for (const MachineOperand &Use : MI->explicit_uses()) {
2318fe6060f1SDimitry Andric       if (!Use.isReg())
2319fe6060f1SDimitry Andric         continue;
2320fe6060f1SDimitry Andric       Reg = Use.getReg();
2321fe6060f1SDimitry Andric 
2322fe6060f1SDimitry Andric       DOT = nullptr;
2323fe6060f1SDimitry Andric       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2324fe6060f1SDimitry Andric                                                      MaxWaitStates);
2325fe6060f1SDimitry Andric       if (DOT) {
2326fe6060f1SDimitry Andric         int NeedWaitStates = 0;
2327fe6060f1SDimitry Andric         if (DOT->getOpcode() == MI->getOpcode()) {
2328fe6060f1SDimitry Andric           if (&Use - &MI->getOperand(0) != SrcCIdx)
2329fe6060f1SDimitry Andric             NeedWaitStates = DotWriteSameDotReadSrcAB;
2330fe6060f1SDimitry Andric         } else {
2331fe6060f1SDimitry Andric           NeedWaitStates = DotWriteDifferentVALURead;
2332fe6060f1SDimitry Andric         }
2333fe6060f1SDimitry Andric 
2334fe6060f1SDimitry Andric         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2335fe6060f1SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2336fe6060f1SDimitry Andric       }
2337fe6060f1SDimitry Andric 
2338fe6060f1SDimitry Andric       MFMA = nullptr;
23394824e7fdSDimitry Andric       WaitStatesSinceDef =
23404824e7fdSDimitry Andric           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2341fe6060f1SDimitry Andric       if (!MFMA)
2342fe6060f1SDimitry Andric         continue;
2343fe6060f1SDimitry Andric 
2344fe6060f1SDimitry Andric       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2345fe6060f1SDimitry Andric       int NeedWaitStates = MaxWaitStates;
2346fe6060f1SDimitry Andric       switch (HazardDefLatency) {
2347fe6060f1SDimitry Andric       case 2:
2348*81ad6265SDimitry Andric         NeedWaitStates =
2349*81ad6265SDimitry Andric           ST.hasGFX940Insts()
2350*81ad6265SDimitry Andric             ? isXDL(ST, *MFMA)
2351*81ad6265SDimitry Andric               ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
2352*81ad6265SDimitry Andric               : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
2353*81ad6265SDimitry Andric             : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2354fe6060f1SDimitry Andric         break;
2355fe6060f1SDimitry Andric       case 4:
2356*81ad6265SDimitry Andric         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2357fe6060f1SDimitry Andric         NeedWaitStates =
2358*81ad6265SDimitry Andric           isDGEMM(MFMA->getOpcode())
2359*81ad6265SDimitry Andric             ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2360*81ad6265SDimitry Andric                             : DMFMA4x4WriteVgprVALUReadWaitStates
2361*81ad6265SDimitry Andric             : isXDL(ST, *MFMA)
2362*81ad6265SDimitry Andric               ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
2363*81ad6265SDimitry Andric               : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2364fe6060f1SDimitry Andric         break;
2365fe6060f1SDimitry Andric       case 8:
2366*81ad6265SDimitry Andric         NeedWaitStates =
2367*81ad6265SDimitry Andric           ST.hasGFX940Insts()
2368*81ad6265SDimitry Andric             ? isXDL(ST, *MFMA)
2369*81ad6265SDimitry Andric               ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2370*81ad6265SDimitry Andric               : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2371*81ad6265SDimitry Andric             : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2372fe6060f1SDimitry Andric         break;
2373fe6060f1SDimitry Andric       case 16: LLVM_FALLTHROUGH;
2374fe6060f1SDimitry Andric       default:
2375fe6060f1SDimitry Andric         NeedWaitStates =
2376fe6060f1SDimitry Andric           isDGEMM(MFMA->getOpcode())
2377fe6060f1SDimitry Andric             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2378fe6060f1SDimitry Andric                             : DMFMA16x16WriteVgprVALUReadWaitStates
2379*81ad6265SDimitry Andric             : ST.hasGFX940Insts()
2380*81ad6265SDimitry Andric               ? isXDL(ST, *MFMA)
2381*81ad6265SDimitry Andric                 ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2382*81ad6265SDimitry Andric                 : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2383fe6060f1SDimitry Andric               : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2384fe6060f1SDimitry Andric         break;
2385fe6060f1SDimitry Andric       }
2386fe6060f1SDimitry Andric 
2387fe6060f1SDimitry Andric       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2388fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2389fe6060f1SDimitry Andric 
2390fe6060f1SDimitry Andric       if (WaitStatesNeeded == MaxWaitStates)
2391fe6060f1SDimitry Andric         break;
2392fe6060f1SDimitry Andric     }
2393fe6060f1SDimitry Andric   }
2394fe6060f1SDimitry Andric 
2395fe6060f1SDimitry Andric   unsigned Opc = MI->getOpcode();
2396fe6060f1SDimitry Andric   const int DMFMAToFMA64WaitStates = 2;
2397fe6060f1SDimitry Andric   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2398fe6060f1SDimitry Andric        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2399fe6060f1SDimitry Andric        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2400fe6060f1SDimitry Andric       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2401fe6060f1SDimitry Andric     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2402fe6060f1SDimitry Andric       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2403fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2404fe6060f1SDimitry Andric   }
2405fe6060f1SDimitry Andric 
2406fe6060f1SDimitry Andric   if (!IsVALU && !IsMemOrExport)
2407fe6060f1SDimitry Andric     return WaitStatesNeeded;
2408fe6060f1SDimitry Andric 
2409fe6060f1SDimitry Andric   for (const MachineOperand &Def : MI->defs()) {
2410fe6060f1SDimitry Andric     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2411fe6060f1SDimitry Andric     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2412fe6060f1SDimitry Andric     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2413*81ad6265SDimitry Andric     const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
2414*81ad6265SDimitry Andric     const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
2415*81ad6265SDimitry Andric     const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
2416*81ad6265SDimitry Andric     const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
2417*81ad6265SDimitry Andric     const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
2418*81ad6265SDimitry Andric     const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
2419*81ad6265SDimitry Andric     const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
2420*81ad6265SDimitry Andric     const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2421fe6060f1SDimitry Andric     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2422*81ad6265SDimitry Andric     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2423fe6060f1SDimitry Andric     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2424fe6060f1SDimitry Andric     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2425fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2426fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2427fe6060f1SDimitry Andric     const int DotWriteDifferentVALUWrite = 3;
2428fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2429fe6060f1SDimitry Andric     const int MaxWarWaitStates = 15;
2430fe6060f1SDimitry Andric 
2431fe6060f1SDimitry Andric     Reg = Def.getReg();
2432fe6060f1SDimitry Andric 
2433fe6060f1SDimitry Andric     DOT = nullptr;
2434fe6060f1SDimitry Andric     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2435fe6060f1SDimitry Andric                                                    MaxWaitStates);
2436fe6060f1SDimitry Andric     if (DOT && DOT->getOpcode() != MI->getOpcode())
2437fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2438fe6060f1SDimitry Andric                                                     WaitStatesSinceDef);
2439fe6060f1SDimitry Andric 
2440fe6060f1SDimitry Andric     MFMA = nullptr;
24414824e7fdSDimitry Andric     WaitStatesSinceDef =
24424824e7fdSDimitry Andric         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2443fe6060f1SDimitry Andric     if (MFMA) {
2444fe6060f1SDimitry Andric       int NeedWaitStates = MaxWaitStates;
2445fe6060f1SDimitry Andric       switch (TSchedModel.computeInstrLatency(MFMA)) {
2446fe6060f1SDimitry Andric       case 2:
2447*81ad6265SDimitry Andric         NeedWaitStates = ST.hasGFX940Insts()
2448*81ad6265SDimitry Andric           ? isXDL(ST, *MFMA)
2449*81ad6265SDimitry Andric             ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
2450*81ad6265SDimitry Andric             : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
2451*81ad6265SDimitry Andric           : SMFMA4x4WriteVgprVALUWawWaitStates;
2452fe6060f1SDimitry Andric         break;
2453fe6060f1SDimitry Andric       case 4:
2454*81ad6265SDimitry Andric         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2455*81ad6265SDimitry Andric         NeedWaitStates = isDGEMM(MFMA->getOpcode())
2456*81ad6265SDimitry Andric             ? DMFMA4x4WriteVgprVALUWriteWaitStates
2457*81ad6265SDimitry Andric             : isXDL(ST, *MFMA)
2458*81ad6265SDimitry Andric               ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
2459*81ad6265SDimitry Andric               : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2460fe6060f1SDimitry Andric         break;
2461fe6060f1SDimitry Andric       case 8:
2462*81ad6265SDimitry Andric         NeedWaitStates = ST.hasGFX940Insts()
2463*81ad6265SDimitry Andric           ? isXDL(ST, *MFMA)
2464*81ad6265SDimitry Andric             ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2465*81ad6265SDimitry Andric             : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2466*81ad6265SDimitry Andric           : SMFMA16x16WriteVgprVALUWawWaitStates;
2467fe6060f1SDimitry Andric         break;
2468fe6060f1SDimitry Andric       case 16: LLVM_FALLTHROUGH;
2469fe6060f1SDimitry Andric       default:
2470fe6060f1SDimitry Andric         NeedWaitStates = isDGEMM(MFMA->getOpcode())
2471fe6060f1SDimitry Andric                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
2472*81ad6265SDimitry Andric                    : ST.hasGFX940Insts()
2473*81ad6265SDimitry Andric                      ? isXDL(ST, *MFMA)
2474*81ad6265SDimitry Andric                        ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2475*81ad6265SDimitry Andric                        : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2476fe6060f1SDimitry Andric                    : SMFMA32x32WriteVgprVALUWawWaitStates;
2477fe6060f1SDimitry Andric         break;
2478fe6060f1SDimitry Andric       }
2479fe6060f1SDimitry Andric 
2480fe6060f1SDimitry Andric       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2481fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2482fe6060f1SDimitry Andric 
2483fe6060f1SDimitry Andric       if (WaitStatesNeeded == MaxWaitStates)
2484fe6060f1SDimitry Andric         break;
2485fe6060f1SDimitry Andric     }
2486fe6060f1SDimitry Andric 
2487*81ad6265SDimitry Andric     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2488*81ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2489fe6060f1SDimitry Andric           !MI.readsRegister(Reg, &TRI))
2490fe6060f1SDimitry Andric         return false;
2491fe6060f1SDimitry Andric 
2492*81ad6265SDimitry Andric       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2493*81ad6265SDimitry Andric         return false;
2494*81ad6265SDimitry Andric 
2495fe6060f1SDimitry Andric       const MachineOperand *SrcC =
2496fe6060f1SDimitry Andric           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2497fe6060f1SDimitry Andric       assert(SrcC);
2498fe6060f1SDimitry Andric       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2499fe6060f1SDimitry Andric         return false;
2500fe6060f1SDimitry Andric 
2501fe6060f1SDimitry Andric       MFMA = &MI;
2502fe6060f1SDimitry Andric       return true;
2503fe6060f1SDimitry Andric     };
2504fe6060f1SDimitry Andric 
2505fe6060f1SDimitry Andric     MFMA = nullptr;
2506fe6060f1SDimitry Andric     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2507fe6060f1SDimitry Andric                                                 MaxWarWaitStates);
2508fe6060f1SDimitry Andric     if (!MFMA)
2509fe6060f1SDimitry Andric       continue;
2510fe6060f1SDimitry Andric 
2511fe6060f1SDimitry Andric     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2512fe6060f1SDimitry Andric     int NeedWaitStates = MaxWaitStates;
2513fe6060f1SDimitry Andric     switch (HazardDefLatency) {
2514fe6060f1SDimitry Andric     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2515fe6060f1SDimitry Andric              break;
2516*81ad6265SDimitry Andric     case 4:  assert(ST.hasGFX940Insts());
2517*81ad6265SDimitry Andric              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2518*81ad6265SDimitry Andric              break;
2519fe6060f1SDimitry Andric     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2520fe6060f1SDimitry Andric              break;
2521fe6060f1SDimitry Andric     case 16: LLVM_FALLTHROUGH;
2522fe6060f1SDimitry Andric     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2523fe6060f1SDimitry Andric              break;
2524fe6060f1SDimitry Andric     }
2525fe6060f1SDimitry Andric 
2526fe6060f1SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2527fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2528fe6060f1SDimitry Andric   }
2529fe6060f1SDimitry Andric 
2530fe6060f1SDimitry Andric   return WaitStatesNeeded;
2531fe6060f1SDimitry Andric }
2532fe6060f1SDimitry Andric 
2533e8d8bef9SDimitry Andric bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2534e8d8bef9SDimitry Andric   if (!SU->isInstr())
2535e8d8bef9SDimitry Andric     return false;
2536e8d8bef9SDimitry Andric 
2537fe6060f1SDimitry Andric   const MachineInstr *MAI = nullptr;
2538*81ad6265SDimitry Andric 
2539fe6060f1SDimitry Andric   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2540e8d8bef9SDimitry Andric     MAI = nullptr;
2541*81ad6265SDimitry Andric     if (SIInstrInfo::isMFMA(MI))
2542fe6060f1SDimitry Andric       MAI = &MI;
2543e8d8bef9SDimitry Andric     return MAI != nullptr;
2544e8d8bef9SDimitry Andric   };
2545e8d8bef9SDimitry Andric 
2546e8d8bef9SDimitry Andric   MachineInstr *MI = SU->getInstr();
2547fe6060f1SDimitry Andric   if (IsMFMAFn(*MI)) {
2548e8d8bef9SDimitry Andric     int W = getWaitStatesSince(IsMFMAFn, 16);
2549e8d8bef9SDimitry Andric     if (MAI)
2550e8d8bef9SDimitry Andric       return W < (int)TSchedModel.computeInstrLatency(MAI);
2551e8d8bef9SDimitry Andric   }
2552e8d8bef9SDimitry Andric 
2553e8d8bef9SDimitry Andric   return false;
2554e8d8bef9SDimitry Andric }
2555