xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (revision b3edf4467982447620505a28fc82e38a414c07dc)
10b57cec5SDimitry Andric //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This file implements hazard recognizers for scheduling on GCN processors.
100b57cec5SDimitry Andric //
110b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric 
130b57cec5SDimitry Andric #include "GCNHazardRecognizer.h"
14e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
150b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1681ad6265SDimitry Andric #include "SIMachineFunctionInfo.h"
170b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
180b57cec5SDimitry Andric #include "llvm/CodeGen/ScheduleDAG.h"
1906c3fb27SDimitry Andric #include "llvm/TargetParser/TargetParser.h"
200b57cec5SDimitry Andric 
210b57cec5SDimitry Andric using namespace llvm;
220b57cec5SDimitry Andric 
2381ad6265SDimitry Andric namespace {
2481ad6265SDimitry Andric 
2581ad6265SDimitry Andric struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
2681ad6265SDimitry Andric   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
2781ad6265SDimitry Andric 
2881ad6265SDimitry Andric   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
2981ad6265SDimitry Andric     if (Arg.getAsInteger(0, Value))
3081ad6265SDimitry Andric       return O.error("'" + Arg + "' value invalid for uint argument!");
3181ad6265SDimitry Andric 
3281ad6265SDimitry Andric     if (Value > 100)
3381ad6265SDimitry Andric       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
3481ad6265SDimitry Andric 
3581ad6265SDimitry Andric     return false;
3681ad6265SDimitry Andric   }
3781ad6265SDimitry Andric };
3881ad6265SDimitry Andric 
3981ad6265SDimitry Andric } // end anonymous namespace
4081ad6265SDimitry Andric 
4181ad6265SDimitry Andric static cl::opt<unsigned, false, MFMAPaddingRatioParser>
4281ad6265SDimitry Andric     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
4381ad6265SDimitry Andric                      cl::desc("Fill a percentage of the latency between "
4481ad6265SDimitry Andric                               "neighboring MFMA with s_nops."));
4581ad6265SDimitry Andric 
460b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4781ad6265SDimitry Andric // Hazard Recognizer Implementation
480b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
490b57cec5SDimitry Andric 
50fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51fe6060f1SDimitry Andric                                                  const GCNSubtarget &ST);
52fe6060f1SDimitry Andric 
530b57cec5SDimitry Andric GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
540b57cec5SDimitry Andric   IsHazardRecognizerMode(false),
550b57cec5SDimitry Andric   CurrCycleInstr(nullptr),
560b57cec5SDimitry Andric   MF(MF),
570b57cec5SDimitry Andric   ST(MF.getSubtarget<GCNSubtarget>()),
580b57cec5SDimitry Andric   TII(*ST.getInstrInfo()),
590b57cec5SDimitry Andric   TRI(TII.getRegisterInfo()),
600b57cec5SDimitry Andric   ClauseUses(TRI.getNumRegUnits()),
610b57cec5SDimitry Andric   ClauseDefs(TRI.getNumRegUnits()) {
62fe6060f1SDimitry Andric   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
630b57cec5SDimitry Andric   TSchedModel.init(&ST);
64fe6060f1SDimitry Andric   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
650b57cec5SDimitry Andric }
660b57cec5SDimitry Andric 
67e8d8bef9SDimitry Andric void GCNHazardRecognizer::Reset() {
68e8d8bef9SDimitry Andric   EmittedInstrs.clear();
69e8d8bef9SDimitry Andric }
70e8d8bef9SDimitry Andric 
710b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
720b57cec5SDimitry Andric   EmitInstruction(SU->getInstr());
730b57cec5SDimitry Andric }
740b57cec5SDimitry Andric 
750b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
760b57cec5SDimitry Andric   CurrCycleInstr = MI;
770b57cec5SDimitry Andric }
780b57cec5SDimitry Andric 
790b57cec5SDimitry Andric static bool isDivFMas(unsigned Opcode) {
80e8d8bef9SDimitry Andric   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
810b57cec5SDimitry Andric }
820b57cec5SDimitry Andric 
830b57cec5SDimitry Andric static bool isSGetReg(unsigned Opcode) {
840b57cec5SDimitry Andric   return Opcode == AMDGPU::S_GETREG_B32;
850b57cec5SDimitry Andric }
860b57cec5SDimitry Andric 
870b57cec5SDimitry Andric static bool isSSetReg(unsigned Opcode) {
88e8d8bef9SDimitry Andric   switch (Opcode) {
89e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_B32:
90e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_B32_mode:
91e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_IMM32_B32:
92e8d8bef9SDimitry Andric   case AMDGPU::S_SETREG_IMM32_B32_mode:
93e8d8bef9SDimitry Andric     return true;
94e8d8bef9SDimitry Andric   }
95e8d8bef9SDimitry Andric   return false;
960b57cec5SDimitry Andric }
970b57cec5SDimitry Andric 
980b57cec5SDimitry Andric static bool isRWLane(unsigned Opcode) {
990b57cec5SDimitry Andric   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
1000b57cec5SDimitry Andric }
1010b57cec5SDimitry Andric 
1020b57cec5SDimitry Andric static bool isRFE(unsigned Opcode) {
1030b57cec5SDimitry Andric   return Opcode == AMDGPU::S_RFE_B64;
1040b57cec5SDimitry Andric }
1050b57cec5SDimitry Andric 
1060b57cec5SDimitry Andric static bool isSMovRel(unsigned Opcode) {
1070b57cec5SDimitry Andric   switch (Opcode) {
1080b57cec5SDimitry Andric   case AMDGPU::S_MOVRELS_B32:
1090b57cec5SDimitry Andric   case AMDGPU::S_MOVRELS_B64:
1100b57cec5SDimitry Andric   case AMDGPU::S_MOVRELD_B32:
1110b57cec5SDimitry Andric   case AMDGPU::S_MOVRELD_B64:
1120b57cec5SDimitry Andric     return true;
1130b57cec5SDimitry Andric   default:
1140b57cec5SDimitry Andric     return false;
1150b57cec5SDimitry Andric   }
1160b57cec5SDimitry Andric }
1170b57cec5SDimitry Andric 
118fe6060f1SDimitry Andric static bool isDGEMM(unsigned Opcode) {
11981ad6265SDimitry Andric   return AMDGPU::getMAIIsDGEMM(Opcode);
120fe6060f1SDimitry Andric }
121fe6060f1SDimitry Andric 
122fe6060f1SDimitry Andric static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123fe6060f1SDimitry Andric   unsigned Opcode = MI.getOpcode();
124fe6060f1SDimitry Andric 
125fe6060f1SDimitry Andric   if (!SIInstrInfo::isMAI(MI) ||
126fe6060f1SDimitry Andric       isDGEMM(Opcode) ||
127fe6060f1SDimitry Andric       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128fe6060f1SDimitry Andric       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129fe6060f1SDimitry Andric     return false;
130fe6060f1SDimitry Andric 
13181ad6265SDimitry Andric   if (!ST.hasGFX940Insts())
132fe6060f1SDimitry Andric     return true;
13381ad6265SDimitry Andric 
13481ad6265SDimitry Andric   return AMDGPU::getMAIIsGFX940XDL(Opcode);
135fe6060f1SDimitry Andric }
136fe6060f1SDimitry Andric 
1370b57cec5SDimitry Andric static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
1380b57cec5SDimitry Andric                                     const MachineInstr &MI) {
1390b57cec5SDimitry Andric   if (TII.isAlwaysGDS(MI.getOpcode()))
1400b57cec5SDimitry Andric     return true;
1410b57cec5SDimitry Andric 
1420b57cec5SDimitry Andric   switch (MI.getOpcode()) {
1430b57cec5SDimitry Andric   case AMDGPU::S_SENDMSG:
1440b57cec5SDimitry Andric   case AMDGPU::S_SENDMSGHALT:
1450b57cec5SDimitry Andric   case AMDGPU::S_TTRACEDATA:
1460b57cec5SDimitry Andric     return true;
1470b57cec5SDimitry Andric   // These DS opcodes don't support GDS.
1480b57cec5SDimitry Andric   case AMDGPU::DS_NOP:
1490b57cec5SDimitry Andric   case AMDGPU::DS_PERMUTE_B32:
1500b57cec5SDimitry Andric   case AMDGPU::DS_BPERMUTE_B32:
1510b57cec5SDimitry Andric     return false;
1520b57cec5SDimitry Andric   default:
1530b57cec5SDimitry Andric     if (TII.isDS(MI.getOpcode())) {
1540b57cec5SDimitry Andric       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1550b57cec5SDimitry Andric                                            AMDGPU::OpName::gds);
1560b57cec5SDimitry Andric       if (MI.getOperand(GDS).getImm())
1570b57cec5SDimitry Andric         return true;
1580b57cec5SDimitry Andric     }
1590b57cec5SDimitry Andric     return false;
1600b57cec5SDimitry Andric   }
1610b57cec5SDimitry Andric }
1620b57cec5SDimitry Andric 
1630b57cec5SDimitry Andric static bool isPermlane(const MachineInstr &MI) {
1640b57cec5SDimitry Andric   unsigned Opcode = MI.getOpcode();
165e8d8bef9SDimitry Andric   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
1667a6dacacSDimitry Andric          Opcode == AMDGPU::V_PERMLANE64_B32 ||
1675f757f3fSDimitry Andric          Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
1685f757f3fSDimitry Andric          Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
1695f757f3fSDimitry Andric          Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
1700b57cec5SDimitry Andric }
1710b57cec5SDimitry Andric 
17281ad6265SDimitry Andric static bool isLdsDma(const MachineInstr &MI) {
17381ad6265SDimitry Andric   return SIInstrInfo::isVALU(MI) &&
17481ad6265SDimitry Andric          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
17581ad6265SDimitry Andric }
17681ad6265SDimitry Andric 
1770b57cec5SDimitry Andric static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
1780b57cec5SDimitry Andric   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
1790b57cec5SDimitry Andric                                                      AMDGPU::OpName::simm16);
1800b57cec5SDimitry Andric   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
1810b57cec5SDimitry Andric }
1820b57cec5SDimitry Andric 
1830b57cec5SDimitry Andric ScheduleHazardRecognizer::HazardType
1840b57cec5SDimitry Andric GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
1850b57cec5SDimitry Andric   MachineInstr *MI = SU->getInstr();
186e8d8bef9SDimitry Andric   // If we are not in "HazardRecognizerMode" and therefore not being run from
187e8d8bef9SDimitry Andric   // the scheduler, track possible stalls from hazards but don't insert noops.
188e8d8bef9SDimitry Andric   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
189e8d8bef9SDimitry Andric 
1900b57cec5SDimitry Andric   if (MI->isBundle())
1910b57cec5SDimitry Andric    return NoHazard;
1920b57cec5SDimitry Andric 
1930b57cec5SDimitry Andric   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
194e8d8bef9SDimitry Andric     return HazardType;
1950b57cec5SDimitry Andric 
1960b57cec5SDimitry Andric   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
197e8d8bef9SDimitry Andric     return HazardType;
1980b57cec5SDimitry Andric 
1990b57cec5SDimitry Andric   if (checkFPAtomicToDenormModeHazard(MI) > 0)
200e8d8bef9SDimitry Andric     return HazardType;
2010b57cec5SDimitry Andric 
2020b57cec5SDimitry Andric   if (ST.hasNoDataDepHazard())
2030b57cec5SDimitry Andric     return NoHazard;
2040b57cec5SDimitry Andric 
205fe6060f1SDimitry Andric   // FIXME: Should flat be considered vmem?
206fe6060f1SDimitry Andric   if ((SIInstrInfo::isVMEM(*MI) ||
207fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI))
208fe6060f1SDimitry Andric       && checkVMEMHazards(MI) > 0)
209fe6060f1SDimitry Andric     return HazardType;
210fe6060f1SDimitry Andric 
2110b57cec5SDimitry Andric   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
212e8d8bef9SDimitry Andric     return HazardType;
2130b57cec5SDimitry Andric 
2140b57cec5SDimitry Andric   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
215e8d8bef9SDimitry Andric     return HazardType;
2160b57cec5SDimitry Andric 
2170b57cec5SDimitry Andric   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
218e8d8bef9SDimitry Andric     return HazardType;
2190b57cec5SDimitry Andric 
2200b57cec5SDimitry Andric   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
221e8d8bef9SDimitry Andric     return HazardType;
2220b57cec5SDimitry Andric 
223fe6060f1SDimitry Andric   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
224fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
225fe6060f1SDimitry Andric        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
226fe6060f1SDimitry Andric     return HazardType;
227fe6060f1SDimitry Andric 
2280b57cec5SDimitry Andric   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
229e8d8bef9SDimitry Andric     return HazardType;
2300b57cec5SDimitry Andric 
2310b57cec5SDimitry Andric   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
232e8d8bef9SDimitry Andric     return HazardType;
2330b57cec5SDimitry Andric 
2340b57cec5SDimitry Andric   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
235e8d8bef9SDimitry Andric     return HazardType;
2360b57cec5SDimitry Andric 
23781ad6265SDimitry Andric   if (((ST.hasReadM0MovRelInterpHazard() &&
238bdd1243dSDimitry Andric         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
239bdd1243dSDimitry Andric          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
240bdd1243dSDimitry Andric          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
24181ad6265SDimitry Andric        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
24281ad6265SDimitry Andric        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
24381ad6265SDimitry Andric        (ST.hasReadM0LdsDirectHazard() &&
24481ad6265SDimitry Andric         MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
2450b57cec5SDimitry Andric       checkReadM0Hazards(MI) > 0)
246e8d8bef9SDimitry Andric     return HazardType;
2470b57cec5SDimitry Andric 
2480b57cec5SDimitry Andric   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
249e8d8bef9SDimitry Andric     return HazardType;
2500b57cec5SDimitry Andric 
251e8d8bef9SDimitry Andric   if ((SIInstrInfo::isVMEM(*MI) ||
252e8d8bef9SDimitry Andric        SIInstrInfo::isFLAT(*MI) ||
253e8d8bef9SDimitry Andric        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
254e8d8bef9SDimitry Andric     return HazardType;
2550b57cec5SDimitry Andric 
2560b57cec5SDimitry Andric   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
257e8d8bef9SDimitry Andric     return HazardType;
2580b57cec5SDimitry Andric 
2590b57cec5SDimitry Andric   return NoHazard;
2600b57cec5SDimitry Andric }
2610b57cec5SDimitry Andric 
262e8d8bef9SDimitry Andric static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
263e8d8bef9SDimitry Andric                                 unsigned Quantity) {
264e8d8bef9SDimitry Andric   while (Quantity > 0) {
265e8d8bef9SDimitry Andric     unsigned Arg = std::min(Quantity, 8u);
266e8d8bef9SDimitry Andric     Quantity -= Arg;
2670b57cec5SDimitry Andric     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
268e8d8bef9SDimitry Andric         .addImm(Arg - 1);
269e8d8bef9SDimitry Andric   }
2700b57cec5SDimitry Andric }
2710b57cec5SDimitry Andric 
27281ad6265SDimitry Andric unsigned
27381ad6265SDimitry Andric GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
27481ad6265SDimitry Andric   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
27581ad6265SDimitry Andric   assert(TSchedModel.getWriteProcResBegin(SC) !=
27681ad6265SDimitry Andric          TSchedModel.getWriteProcResEnd(SC));
2775f757f3fSDimitry Andric   return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
27881ad6265SDimitry Andric }
27981ad6265SDimitry Andric 
2800b57cec5SDimitry Andric void GCNHazardRecognizer::processBundle() {
2810b57cec5SDimitry Andric   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
2820b57cec5SDimitry Andric   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
2830b57cec5SDimitry Andric   // Check bundled MachineInstr's for hazards.
2840b57cec5SDimitry Andric   for (; MI != E && MI->isInsideBundle(); ++MI) {
2850b57cec5SDimitry Andric     CurrCycleInstr = &*MI;
2860b57cec5SDimitry Andric     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
2870b57cec5SDimitry Andric 
288e8d8bef9SDimitry Andric     if (IsHazardRecognizerMode) {
2890b57cec5SDimitry Andric       fixHazards(CurrCycleInstr);
2900b57cec5SDimitry Andric 
291e8d8bef9SDimitry Andric       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
292e8d8bef9SDimitry Andric     }
2930b57cec5SDimitry Andric 
2940b57cec5SDimitry Andric     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
2950b57cec5SDimitry Andric     // include the bundled MI directly after, only add a maximum of
2960b57cec5SDimitry Andric     // (MaxLookAhead - 1) noops to EmittedInstrs.
2970b57cec5SDimitry Andric     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
2980b57cec5SDimitry Andric       EmittedInstrs.push_front(nullptr);
2990b57cec5SDimitry Andric 
3000b57cec5SDimitry Andric     EmittedInstrs.push_front(CurrCycleInstr);
3010b57cec5SDimitry Andric     EmittedInstrs.resize(MaxLookAhead);
3020b57cec5SDimitry Andric   }
3030b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
3040b57cec5SDimitry Andric }
3050b57cec5SDimitry Andric 
306bdd1243dSDimitry Andric void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
307bdd1243dSDimitry Andric   assert(IsHazardRecognizerMode);
308bdd1243dSDimitry Andric 
309bdd1243dSDimitry Andric   unsigned NumPreNoops = PreEmitNoops(MI);
310bdd1243dSDimitry Andric   EmitNoops(NumPreNoops);
311bdd1243dSDimitry Andric   if (MI->isInsideBundle())
312bdd1243dSDimitry Andric     insertNoopsInBundle(MI, TII, NumPreNoops);
313bdd1243dSDimitry Andric   else
314bdd1243dSDimitry Andric     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
315bdd1243dSDimitry Andric                     NumPreNoops);
316bdd1243dSDimitry Andric   EmitInstruction(MI);
317bdd1243dSDimitry Andric   AdvanceCycle();
318bdd1243dSDimitry Andric }
319bdd1243dSDimitry Andric 
3200b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
3210b57cec5SDimitry Andric   IsHazardRecognizerMode = true;
3220b57cec5SDimitry Andric   CurrCycleInstr = MI;
3230b57cec5SDimitry Andric   unsigned W = PreEmitNoopsCommon(MI);
3240b57cec5SDimitry Andric   fixHazards(MI);
3250b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
3260b57cec5SDimitry Andric   return W;
3270b57cec5SDimitry Andric }
3280b57cec5SDimitry Andric 
3290b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
3300b57cec5SDimitry Andric   if (MI->isBundle())
3310b57cec5SDimitry Andric     return 0;
3320b57cec5SDimitry Andric 
333e8d8bef9SDimitry Andric   int WaitStates = 0;
3340b57cec5SDimitry Andric 
3350b57cec5SDimitry Andric   if (SIInstrInfo::isSMRD(*MI))
3360b57cec5SDimitry Andric     return std::max(WaitStates, checkSMRDHazards(MI));
3370b57cec5SDimitry Andric 
3380b57cec5SDimitry Andric   if (ST.hasNSAtoVMEMBug())
3390b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
3400b57cec5SDimitry Andric 
3410b57cec5SDimitry Andric   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
3420b57cec5SDimitry Andric 
3430b57cec5SDimitry Andric   if (ST.hasNoDataDepHazard())
3440b57cec5SDimitry Andric     return WaitStates;
3450b57cec5SDimitry Andric 
346fe6060f1SDimitry Andric   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
347fe6060f1SDimitry Andric     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
348fe6060f1SDimitry Andric 
3490b57cec5SDimitry Andric   if (SIInstrInfo::isVALU(*MI))
3500b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
3510b57cec5SDimitry Andric 
3520b57cec5SDimitry Andric   if (SIInstrInfo::isDPP(*MI))
3530b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
3540b57cec5SDimitry Andric 
3550b57cec5SDimitry Andric   if (isDivFMas(MI->getOpcode()))
3560b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
3570b57cec5SDimitry Andric 
3580b57cec5SDimitry Andric   if (isRWLane(MI->getOpcode()))
3590b57cec5SDimitry Andric     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
3600b57cec5SDimitry Andric 
361fe6060f1SDimitry Andric   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
362fe6060f1SDimitry Andric        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
363fe6060f1SDimitry Andric        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
364fe6060f1SDimitry Andric     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
365fe6060f1SDimitry Andric 
3660b57cec5SDimitry Andric   if (MI->isInlineAsm())
3670b57cec5SDimitry Andric     return std::max(WaitStates, checkInlineAsmHazards(MI));
3680b57cec5SDimitry Andric 
3690b57cec5SDimitry Andric   if (isSGetReg(MI->getOpcode()))
3700b57cec5SDimitry Andric     return std::max(WaitStates, checkGetRegHazards(MI));
3710b57cec5SDimitry Andric 
3720b57cec5SDimitry Andric   if (isSSetReg(MI->getOpcode()))
3730b57cec5SDimitry Andric     return std::max(WaitStates, checkSetRegHazards(MI));
3740b57cec5SDimitry Andric 
3750b57cec5SDimitry Andric   if (isRFE(MI->getOpcode()))
3760b57cec5SDimitry Andric     return std::max(WaitStates, checkRFEHazards(MI));
3770b57cec5SDimitry Andric 
37881ad6265SDimitry Andric   if ((ST.hasReadM0MovRelInterpHazard() &&
379bdd1243dSDimitry Andric        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
380bdd1243dSDimitry Andric         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
381bdd1243dSDimitry Andric         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
38281ad6265SDimitry Andric       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
38381ad6265SDimitry Andric       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
38481ad6265SDimitry Andric       (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
3850b57cec5SDimitry Andric     return std::max(WaitStates, checkReadM0Hazards(MI));
3860b57cec5SDimitry Andric 
3870b57cec5SDimitry Andric   if (SIInstrInfo::isMAI(*MI))
3880b57cec5SDimitry Andric     return std::max(WaitStates, checkMAIHazards(MI));
3890b57cec5SDimitry Andric 
390e8d8bef9SDimitry Andric   if (SIInstrInfo::isVMEM(*MI) ||
391e8d8bef9SDimitry Andric       SIInstrInfo::isFLAT(*MI) ||
392e8d8bef9SDimitry Andric       SIInstrInfo::isDS(*MI))
3930b57cec5SDimitry Andric     return std::max(WaitStates, checkMAILdStHazards(MI));
3940b57cec5SDimitry Andric 
3950b57cec5SDimitry Andric   return WaitStates;
3960b57cec5SDimitry Andric }
3970b57cec5SDimitry Andric 
3980b57cec5SDimitry Andric void GCNHazardRecognizer::EmitNoop() {
3990b57cec5SDimitry Andric   EmittedInstrs.push_front(nullptr);
4000b57cec5SDimitry Andric }
4010b57cec5SDimitry Andric 
4020b57cec5SDimitry Andric void GCNHazardRecognizer::AdvanceCycle() {
4030b57cec5SDimitry Andric   // When the scheduler detects a stall, it will call AdvanceCycle() without
4040b57cec5SDimitry Andric   // emitting any instructions.
405e8d8bef9SDimitry Andric   if (!CurrCycleInstr) {
406e8d8bef9SDimitry Andric     EmittedInstrs.push_front(nullptr);
4070b57cec5SDimitry Andric     return;
408e8d8bef9SDimitry Andric   }
4090b57cec5SDimitry Andric 
4100b57cec5SDimitry Andric   if (CurrCycleInstr->isBundle()) {
4110b57cec5SDimitry Andric     processBundle();
4120b57cec5SDimitry Andric     return;
4130b57cec5SDimitry Andric   }
4140b57cec5SDimitry Andric 
4150b57cec5SDimitry Andric   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
416349cc55cSDimitry Andric   if (!NumWaitStates) {
417349cc55cSDimitry Andric     CurrCycleInstr = nullptr;
418349cc55cSDimitry Andric     return;
419349cc55cSDimitry Andric   }
4200b57cec5SDimitry Andric 
4210b57cec5SDimitry Andric   // Keep track of emitted instructions
4220b57cec5SDimitry Andric   EmittedInstrs.push_front(CurrCycleInstr);
4230b57cec5SDimitry Andric 
4240b57cec5SDimitry Andric   // Add a nullptr for each additional wait state after the first.  Make sure
4250b57cec5SDimitry Andric   // not to add more than getMaxLookAhead() items to the list, since we
4260b57cec5SDimitry Andric   // truncate the list to that size right after this loop.
4270b57cec5SDimitry Andric   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
4280b57cec5SDimitry Andric        i < e; ++i) {
4290b57cec5SDimitry Andric     EmittedInstrs.push_front(nullptr);
4300b57cec5SDimitry Andric   }
4310b57cec5SDimitry Andric 
4320b57cec5SDimitry Andric   // getMaxLookahead() is the largest number of wait states we will ever need
4330b57cec5SDimitry Andric   // to insert, so there is no point in keeping track of more than that many
4340b57cec5SDimitry Andric   // wait states.
4350b57cec5SDimitry Andric   EmittedInstrs.resize(getMaxLookAhead());
4360b57cec5SDimitry Andric 
4370b57cec5SDimitry Andric   CurrCycleInstr = nullptr;
4380b57cec5SDimitry Andric }
4390b57cec5SDimitry Andric 
4400b57cec5SDimitry Andric void GCNHazardRecognizer::RecedeCycle() {
4410b57cec5SDimitry Andric   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
4420b57cec5SDimitry Andric }
4430b57cec5SDimitry Andric 
4440b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4450b57cec5SDimitry Andric // Helper Functions
4460b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
4470b57cec5SDimitry Andric 
44881ad6265SDimitry Andric typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
44981ad6265SDimitry Andric 
450fe6060f1SDimitry Andric typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
45181ad6265SDimitry Andric typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
45281ad6265SDimitry Andric 
45381ad6265SDimitry Andric // Search for a hazard in a block and its predecessors.
45481ad6265SDimitry Andric template <typename StateT>
45581ad6265SDimitry Andric static bool
45681ad6265SDimitry Andric hasHazard(StateT State,
45781ad6265SDimitry Andric           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
45881ad6265SDimitry Andric           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
45981ad6265SDimitry Andric           const MachineBasicBlock *MBB,
46081ad6265SDimitry Andric           MachineBasicBlock::const_reverse_instr_iterator I,
46181ad6265SDimitry Andric           DenseSet<const MachineBasicBlock *> &Visited) {
46281ad6265SDimitry Andric   for (auto E = MBB->instr_rend(); I != E; ++I) {
46381ad6265SDimitry Andric     // No need to look at parent BUNDLE instructions.
46481ad6265SDimitry Andric     if (I->isBundle())
46581ad6265SDimitry Andric       continue;
46681ad6265SDimitry Andric 
46781ad6265SDimitry Andric     switch (IsHazard(State, *I)) {
46881ad6265SDimitry Andric     case HazardFound:
46981ad6265SDimitry Andric       return true;
47081ad6265SDimitry Andric     case HazardExpired:
47181ad6265SDimitry Andric       return false;
47281ad6265SDimitry Andric     default:
47381ad6265SDimitry Andric       // Continue search
47481ad6265SDimitry Andric       break;
47581ad6265SDimitry Andric     }
47681ad6265SDimitry Andric 
47781ad6265SDimitry Andric     if (I->isInlineAsm() || I->isMetaInstruction())
47881ad6265SDimitry Andric       continue;
47981ad6265SDimitry Andric 
48081ad6265SDimitry Andric     UpdateState(State, *I);
48181ad6265SDimitry Andric   }
48281ad6265SDimitry Andric 
48381ad6265SDimitry Andric   for (MachineBasicBlock *Pred : MBB->predecessors()) {
48481ad6265SDimitry Andric     if (!Visited.insert(Pred).second)
48581ad6265SDimitry Andric       continue;
48681ad6265SDimitry Andric 
48781ad6265SDimitry Andric     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
48881ad6265SDimitry Andric                   Visited))
48981ad6265SDimitry Andric       return true;
49081ad6265SDimitry Andric   }
49181ad6265SDimitry Andric 
49281ad6265SDimitry Andric   return false;
49381ad6265SDimitry Andric }
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric // Returns a minimum wait states since \p I walking all predecessors.
4960b57cec5SDimitry Andric // Only scans until \p IsExpired does not return true.
4970b57cec5SDimitry Andric // Can only be run in a hazard recognizer mode.
49881ad6265SDimitry Andric static int getWaitStatesSince(
49981ad6265SDimitry Andric     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
50081ad6265SDimitry Andric     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
50181ad6265SDimitry Andric     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
50281ad6265SDimitry Andric     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
5030b57cec5SDimitry Andric   for (auto E = MBB->instr_rend(); I != E; ++I) {
5040b57cec5SDimitry Andric     // Don't add WaitStates for parent BUNDLE instructions.
5050b57cec5SDimitry Andric     if (I->isBundle())
5060b57cec5SDimitry Andric       continue;
5070b57cec5SDimitry Andric 
508fe6060f1SDimitry Andric     if (IsHazard(*I))
5090b57cec5SDimitry Andric       return WaitStates;
5100b57cec5SDimitry Andric 
511349cc55cSDimitry Andric     if (I->isInlineAsm())
5120b57cec5SDimitry Andric       continue;
5130b57cec5SDimitry Andric 
51481ad6265SDimitry Andric     WaitStates += GetNumWaitStates(*I);
5150b57cec5SDimitry Andric 
516fe6060f1SDimitry Andric     if (IsExpired(*I, WaitStates))
5170b57cec5SDimitry Andric       return std::numeric_limits<int>::max();
5180b57cec5SDimitry Andric   }
5190b57cec5SDimitry Andric 
520fe6060f1SDimitry Andric   int MinWaitStates = std::numeric_limits<int>::max();
5210b57cec5SDimitry Andric   for (MachineBasicBlock *Pred : MBB->predecessors()) {
5220b57cec5SDimitry Andric     if (!Visited.insert(Pred).second)
5230b57cec5SDimitry Andric       continue;
5240b57cec5SDimitry Andric 
52581ad6265SDimitry Andric     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
52681ad6265SDimitry Andric                                IsExpired, Visited, GetNumWaitStates);
5270b57cec5SDimitry Andric 
528fe6060f1SDimitry Andric     MinWaitStates = std::min(MinWaitStates, W);
5290b57cec5SDimitry Andric   }
5300b57cec5SDimitry Andric 
5310b57cec5SDimitry Andric   return MinWaitStates;
5320b57cec5SDimitry Andric }
5330b57cec5SDimitry Andric 
5340b57cec5SDimitry Andric static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
535fe6060f1SDimitry Andric                               const MachineInstr *MI, IsExpiredFn IsExpired) {
5360b57cec5SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
5370b57cec5SDimitry Andric   return getWaitStatesSince(IsHazard, MI->getParent(),
5380b57cec5SDimitry Andric                             std::next(MI->getReverseIterator()),
5390b57cec5SDimitry Andric                             0, IsExpired, Visited);
5400b57cec5SDimitry Andric }
5410b57cec5SDimitry Andric 
5420b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
5430b57cec5SDimitry Andric   if (IsHazardRecognizerMode) {
544fe6060f1SDimitry Andric     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
5450b57cec5SDimitry Andric       return WaitStates >= Limit;
5460b57cec5SDimitry Andric     };
5470b57cec5SDimitry Andric     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
5480b57cec5SDimitry Andric   }
5490b57cec5SDimitry Andric 
5500b57cec5SDimitry Andric   int WaitStates = 0;
5510b57cec5SDimitry Andric   for (MachineInstr *MI : EmittedInstrs) {
5520b57cec5SDimitry Andric     if (MI) {
553fe6060f1SDimitry Andric       if (IsHazard(*MI))
5540b57cec5SDimitry Andric         return WaitStates;
5550b57cec5SDimitry Andric 
5560b57cec5SDimitry Andric       if (MI->isInlineAsm())
5570b57cec5SDimitry Andric         continue;
5580b57cec5SDimitry Andric     }
5590b57cec5SDimitry Andric     ++WaitStates;
5600b57cec5SDimitry Andric 
5610b57cec5SDimitry Andric     if (WaitStates >= Limit)
5620b57cec5SDimitry Andric       break;
5630b57cec5SDimitry Andric   }
5640b57cec5SDimitry Andric   return std::numeric_limits<int>::max();
5650b57cec5SDimitry Andric }
5660b57cec5SDimitry Andric 
5670b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
5680b57cec5SDimitry Andric                                                IsHazardFn IsHazardDef,
5690b57cec5SDimitry Andric                                                int Limit) {
5700b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
5710b57cec5SDimitry Andric 
572fe6060f1SDimitry Andric   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
573fe6060f1SDimitry Andric     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
5740b57cec5SDimitry Andric   };
5750b57cec5SDimitry Andric 
5760b57cec5SDimitry Andric   return getWaitStatesSince(IsHazardFn, Limit);
5770b57cec5SDimitry Andric }
5780b57cec5SDimitry Andric 
5790b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
5800b57cec5SDimitry Andric                                                   int Limit) {
581fe6060f1SDimitry Andric   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
582fe6060f1SDimitry Andric     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
5830b57cec5SDimitry Andric   };
5840b57cec5SDimitry Andric 
5850b57cec5SDimitry Andric   return getWaitStatesSince(IsHazardFn, Limit);
5860b57cec5SDimitry Andric }
5870b57cec5SDimitry Andric 
5880b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5890b57cec5SDimitry Andric // No-op Hazard Detection
5900b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5910b57cec5SDimitry Andric 
592e8d8bef9SDimitry Andric static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
593e8d8bef9SDimitry Andric                         MCRegister Reg) {
59406c3fb27SDimitry Andric   for (MCRegUnit Unit : TRI.regunits(Reg))
59506c3fb27SDimitry Andric     BV.set(Unit);
5960b57cec5SDimitry Andric }
5970b57cec5SDimitry Andric 
5980b57cec5SDimitry Andric static void addRegsToSet(const SIRegisterInfo &TRI,
5990b57cec5SDimitry Andric                          iterator_range<MachineInstr::const_mop_iterator> Ops,
60006c3fb27SDimitry Andric                          BitVector &DefSet, BitVector &UseSet) {
6010b57cec5SDimitry Andric   for (const MachineOperand &Op : Ops) {
6020b57cec5SDimitry Andric     if (Op.isReg())
60306c3fb27SDimitry Andric       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
6040b57cec5SDimitry Andric   }
6050b57cec5SDimitry Andric }
6060b57cec5SDimitry Andric 
6070b57cec5SDimitry Andric void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
60806c3fb27SDimitry Andric   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
6090b57cec5SDimitry Andric }
6100b57cec5SDimitry Andric 
6115ffd83dbSDimitry Andric static bool breaksSMEMSoftClause(MachineInstr *MI) {
6125ffd83dbSDimitry Andric   return !SIInstrInfo::isSMRD(*MI);
6135ffd83dbSDimitry Andric }
6145ffd83dbSDimitry Andric 
6155ffd83dbSDimitry Andric static bool breaksVMEMSoftClause(MachineInstr *MI) {
6165ffd83dbSDimitry Andric   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
6175ffd83dbSDimitry Andric }
6185ffd83dbSDimitry Andric 
6190b57cec5SDimitry Andric int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
6200b57cec5SDimitry Andric   // SMEM soft clause are only present on VI+, and only matter if xnack is
6210b57cec5SDimitry Andric   // enabled.
6220b57cec5SDimitry Andric   if (!ST.isXNACKEnabled())
6230b57cec5SDimitry Andric     return 0;
6240b57cec5SDimitry Andric 
6250b57cec5SDimitry Andric   bool IsSMRD = TII.isSMRD(*MEM);
6260b57cec5SDimitry Andric 
6270b57cec5SDimitry Andric   resetClause();
6280b57cec5SDimitry Andric 
6290b57cec5SDimitry Andric   // A soft-clause is any group of consecutive SMEM instructions.  The
6300b57cec5SDimitry Andric   // instructions in this group may return out of order and/or may be
6310b57cec5SDimitry Andric   // replayed (i.e. the same instruction issued more than once).
6320b57cec5SDimitry Andric   //
6330b57cec5SDimitry Andric   // In order to handle these situations correctly we need to make sure that
6340b57cec5SDimitry Andric   // when a clause has more than one instruction, no instruction in the clause
6350b57cec5SDimitry Andric   // writes to a register that is read by another instruction in the clause
63681ad6265SDimitry Andric   // (including itself). If we encounter this situation, we need to break the
6370b57cec5SDimitry Andric   // clause by inserting a non SMEM instruction.
6380b57cec5SDimitry Andric 
6390b57cec5SDimitry Andric   for (MachineInstr *MI : EmittedInstrs) {
6400b57cec5SDimitry Andric     // When we hit a non-SMEM instruction then we have passed the start of the
6410b57cec5SDimitry Andric     // clause and we can stop.
6420b57cec5SDimitry Andric     if (!MI)
6430b57cec5SDimitry Andric       break;
6440b57cec5SDimitry Andric 
6455ffd83dbSDimitry Andric     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
6460b57cec5SDimitry Andric       break;
6470b57cec5SDimitry Andric 
6480b57cec5SDimitry Andric     addClauseInst(*MI);
6490b57cec5SDimitry Andric   }
6500b57cec5SDimitry Andric 
6510b57cec5SDimitry Andric   if (ClauseDefs.none())
6520b57cec5SDimitry Andric     return 0;
6530b57cec5SDimitry Andric 
6540b57cec5SDimitry Andric   // We need to make sure not to put loads and stores in the same clause if they
6550b57cec5SDimitry Andric   // use the same address. For now, just start a new clause whenever we see a
6560b57cec5SDimitry Andric   // store.
6570b57cec5SDimitry Andric   if (MEM->mayStore())
6580b57cec5SDimitry Andric     return 1;
6590b57cec5SDimitry Andric 
6600b57cec5SDimitry Andric   addClauseInst(*MEM);
6610b57cec5SDimitry Andric 
6620b57cec5SDimitry Andric   // If the set of defs and uses intersect then we cannot add this instruction
6630b57cec5SDimitry Andric   // to the clause, so we have a hazard.
6640b57cec5SDimitry Andric   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
6650b57cec5SDimitry Andric }
6660b57cec5SDimitry Andric 
6670b57cec5SDimitry Andric int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
6680b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
6690b57cec5SDimitry Andric 
6700b57cec5SDimitry Andric   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
6710b57cec5SDimitry Andric 
6720b57cec5SDimitry Andric   // This SMRD hazard only affects SI.
6730b57cec5SDimitry Andric   if (!ST.hasSMRDReadVALUDefHazard())
6740b57cec5SDimitry Andric     return WaitStatesNeeded;
6750b57cec5SDimitry Andric 
6760b57cec5SDimitry Andric   // A read of an SGPR by SMRD instruction requires 4 wait states when the
6770b57cec5SDimitry Andric   // SGPR was written by a VALU instruction.
6780b57cec5SDimitry Andric   int SmrdSgprWaitStates = 4;
679fe6060f1SDimitry Andric   auto IsHazardDefFn = [this](const MachineInstr &MI) {
680fe6060f1SDimitry Andric     return TII.isVALU(MI);
681fe6060f1SDimitry Andric   };
682fe6060f1SDimitry Andric   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
683fe6060f1SDimitry Andric     return TII.isSALU(MI);
684fe6060f1SDimitry Andric   };
6850b57cec5SDimitry Andric 
6860b57cec5SDimitry Andric   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
6870b57cec5SDimitry Andric 
6880b57cec5SDimitry Andric   for (const MachineOperand &Use : SMRD->uses()) {
6890b57cec5SDimitry Andric     if (!Use.isReg())
6900b57cec5SDimitry Andric       continue;
6910b57cec5SDimitry Andric     int WaitStatesNeededForUse =
6920b57cec5SDimitry Andric         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
6930b57cec5SDimitry Andric                                                    SmrdSgprWaitStates);
6940b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
6950b57cec5SDimitry Andric 
6960b57cec5SDimitry Andric     // This fixes what appears to be undocumented hardware behavior in SI where
6970b57cec5SDimitry Andric     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
6980b57cec5SDimitry Andric     // needs some number of nops in between. We don't know how many we need, but
6990b57cec5SDimitry Andric     // let's use 4. This wasn't discovered before probably because the only
7000b57cec5SDimitry Andric     // case when this happens is when we expand a 64-bit pointer into a full
7010b57cec5SDimitry Andric     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
7020b57cec5SDimitry Andric     // probably never encountered in the closed-source land.
7030b57cec5SDimitry Andric     if (IsBufferSMRD) {
7040b57cec5SDimitry Andric       int WaitStatesNeededForUse =
7050b57cec5SDimitry Andric         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
7060b57cec5SDimitry Andric                                                    IsBufferHazardDefFn,
7070b57cec5SDimitry Andric                                                    SmrdSgprWaitStates);
7080b57cec5SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7090b57cec5SDimitry Andric     }
7100b57cec5SDimitry Andric   }
7110b57cec5SDimitry Andric 
7120b57cec5SDimitry Andric   return WaitStatesNeeded;
7130b57cec5SDimitry Andric }
7140b57cec5SDimitry Andric 
7150b57cec5SDimitry Andric int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
7160b57cec5SDimitry Andric   if (!ST.hasVMEMReadSGPRVALUDefHazard())
7170b57cec5SDimitry Andric     return 0;
7180b57cec5SDimitry Andric 
7190b57cec5SDimitry Andric   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
7200b57cec5SDimitry Andric 
7210b57cec5SDimitry Andric   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
7220b57cec5SDimitry Andric   // SGPR was written by a VALU Instruction.
7230b57cec5SDimitry Andric   const int VmemSgprWaitStates = 5;
724fe6060f1SDimitry Andric   auto IsHazardDefFn = [this](const MachineInstr &MI) {
725fe6060f1SDimitry Andric     return TII.isVALU(MI);
726fe6060f1SDimitry Andric   };
7270b57cec5SDimitry Andric   for (const MachineOperand &Use : VMEM->uses()) {
728fe6060f1SDimitry Andric     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
7290b57cec5SDimitry Andric       continue;
7300b57cec5SDimitry Andric 
7310b57cec5SDimitry Andric     int WaitStatesNeededForUse =
7320b57cec5SDimitry Andric         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
7330b57cec5SDimitry Andric                                                    VmemSgprWaitStates);
7340b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7350b57cec5SDimitry Andric   }
7360b57cec5SDimitry Andric   return WaitStatesNeeded;
7370b57cec5SDimitry Andric }
7380b57cec5SDimitry Andric 
7390b57cec5SDimitry Andric int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
7400b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
7410b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7420b57cec5SDimitry Andric 
7430b57cec5SDimitry Andric   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
7440b57cec5SDimitry Andric   int DppVgprWaitStates = 2;
7450b57cec5SDimitry Andric   int DppExecWaitStates = 5;
7460b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
747fe6060f1SDimitry Andric   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
748fe6060f1SDimitry Andric     return TII->isVALU(MI);
749fe6060f1SDimitry Andric   };
7500b57cec5SDimitry Andric 
7510b57cec5SDimitry Andric   for (const MachineOperand &Use : DPP->uses()) {
7520b57cec5SDimitry Andric     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
7530b57cec5SDimitry Andric       continue;
7540b57cec5SDimitry Andric     int WaitStatesNeededForUse =
755fe6060f1SDimitry Andric         DppVgprWaitStates - getWaitStatesSinceDef(
756fe6060f1SDimitry Andric                                 Use.getReg(),
757fe6060f1SDimitry Andric                                 [](const MachineInstr &) { return true; },
7580b57cec5SDimitry Andric                                 DppVgprWaitStates);
7590b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7600b57cec5SDimitry Andric   }
7610b57cec5SDimitry Andric 
7620b57cec5SDimitry Andric   WaitStatesNeeded = std::max(
7630b57cec5SDimitry Andric       WaitStatesNeeded,
7640b57cec5SDimitry Andric       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
7650b57cec5SDimitry Andric                                                 DppExecWaitStates));
7660b57cec5SDimitry Andric 
7670b57cec5SDimitry Andric   return WaitStatesNeeded;
7680b57cec5SDimitry Andric }
7690b57cec5SDimitry Andric 
7700b57cec5SDimitry Andric int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
7710b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7720b57cec5SDimitry Andric 
7730b57cec5SDimitry Andric   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
7740b57cec5SDimitry Andric   // instruction.
7750b57cec5SDimitry Andric   const int DivFMasWaitStates = 4;
776fe6060f1SDimitry Andric   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
777fe6060f1SDimitry Andric     return TII->isVALU(MI);
778fe6060f1SDimitry Andric   };
7790b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
7800b57cec5SDimitry Andric                                                DivFMasWaitStates);
7810b57cec5SDimitry Andric 
7820b57cec5SDimitry Andric   return DivFMasWaitStates - WaitStatesNeeded;
7830b57cec5SDimitry Andric }
7840b57cec5SDimitry Andric 
7850b57cec5SDimitry Andric int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
7860b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
7870b57cec5SDimitry Andric   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
7880b57cec5SDimitry Andric 
7890b57cec5SDimitry Andric   const int GetRegWaitStates = 2;
790fe6060f1SDimitry Andric   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
791fe6060f1SDimitry Andric     return GetRegHWReg == getHWReg(TII, MI);
7920b57cec5SDimitry Andric   };
7930b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
7940b57cec5SDimitry Andric 
7950b57cec5SDimitry Andric   return GetRegWaitStates - WaitStatesNeeded;
7960b57cec5SDimitry Andric }
7970b57cec5SDimitry Andric 
7980b57cec5SDimitry Andric int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
7990b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
8000b57cec5SDimitry Andric   unsigned HWReg = getHWReg(TII, *SetRegInstr);
8010b57cec5SDimitry Andric 
8020b57cec5SDimitry Andric   const int SetRegWaitStates = ST.getSetRegWaitStates();
803fe6060f1SDimitry Andric   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
804fe6060f1SDimitry Andric     return HWReg == getHWReg(TII, MI);
8050b57cec5SDimitry Andric   };
8060b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
8070b57cec5SDimitry Andric   return SetRegWaitStates - WaitStatesNeeded;
8080b57cec5SDimitry Andric }
8090b57cec5SDimitry Andric 
8100b57cec5SDimitry Andric int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
8110b57cec5SDimitry Andric   if (!MI.mayStore())
8120b57cec5SDimitry Andric     return -1;
8130b57cec5SDimitry Andric 
8140b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
8150b57cec5SDimitry Andric   unsigned Opcode = MI.getOpcode();
8160b57cec5SDimitry Andric   const MCInstrDesc &Desc = MI.getDesc();
8170b57cec5SDimitry Andric 
8180b57cec5SDimitry Andric   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
8190b57cec5SDimitry Andric   int VDataRCID = -1;
8200b57cec5SDimitry Andric   if (VDataIdx != -1)
821bdd1243dSDimitry Andric     VDataRCID = Desc.operands()[VDataIdx].RegClass;
8220b57cec5SDimitry Andric 
8230b57cec5SDimitry Andric   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
8240b57cec5SDimitry Andric     // There is no hazard if the instruction does not use vector regs
8250b57cec5SDimitry Andric     // (like wbinvl1)
8260b57cec5SDimitry Andric     if (VDataIdx == -1)
8270b57cec5SDimitry Andric       return -1;
8280b57cec5SDimitry Andric     // For MUBUF/MTBUF instructions this hazard only exists if the
8290b57cec5SDimitry Andric     // instruction is not using a register in the soffset field.
8300b57cec5SDimitry Andric     const MachineOperand *SOffset =
8310b57cec5SDimitry Andric         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
8320b57cec5SDimitry Andric     // If we have no soffset operand, then assume this field has been
8330b57cec5SDimitry Andric     // hardcoded to zero.
8340b57cec5SDimitry Andric     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
8350b57cec5SDimitry Andric         (!SOffset || !SOffset->isReg()))
8360b57cec5SDimitry Andric       return VDataIdx;
8370b57cec5SDimitry Andric   }
8380b57cec5SDimitry Andric 
8390b57cec5SDimitry Andric   // MIMG instructions create a hazard if they don't use a 256-bit T# and
8400b57cec5SDimitry Andric   // the store size is greater than 8 bytes and they have more than two bits
8410b57cec5SDimitry Andric   // of their dmask set.
8420b57cec5SDimitry Andric   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
8430b57cec5SDimitry Andric   if (TII->isMIMG(MI)) {
8440b57cec5SDimitry Andric     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
8450b57cec5SDimitry Andric     assert(SRsrcIdx != -1 &&
846bdd1243dSDimitry Andric            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
8470b57cec5SDimitry Andric     (void)SRsrcIdx;
8480b57cec5SDimitry Andric   }
8490b57cec5SDimitry Andric 
8500b57cec5SDimitry Andric   if (TII->isFLAT(MI)) {
8510b57cec5SDimitry Andric     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
852bdd1243dSDimitry Andric     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
8530b57cec5SDimitry Andric       return DataIdx;
8540b57cec5SDimitry Andric   }
8550b57cec5SDimitry Andric 
8560b57cec5SDimitry Andric   return -1;
8570b57cec5SDimitry Andric }
8580b57cec5SDimitry Andric 
859e8d8bef9SDimitry Andric int
860e8d8bef9SDimitry Andric GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
8610b57cec5SDimitry Andric                                             const MachineRegisterInfo &MRI) {
8620b57cec5SDimitry Andric   // Helper to check for the hazard where VMEM instructions that store more than
8630b57cec5SDimitry Andric   // 8 bytes can have there store data over written by the next instruction.
8640b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
8650b57cec5SDimitry Andric 
86681ad6265SDimitry Andric   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
8670b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
8680b57cec5SDimitry Andric 
869fe6060f1SDimitry Andric   if (!TRI->isVectorRegister(MRI, Def.getReg()))
8700b57cec5SDimitry Andric     return WaitStatesNeeded;
8718bcb0991SDimitry Andric   Register Reg = Def.getReg();
872fe6060f1SDimitry Andric   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
873fe6060f1SDimitry Andric     int DataIdx = createsVALUHazard(MI);
8740b57cec5SDimitry Andric     return DataIdx >= 0 &&
875fe6060f1SDimitry Andric            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
8760b57cec5SDimitry Andric   };
8770b57cec5SDimitry Andric   int WaitStatesNeededForDef =
8780b57cec5SDimitry Andric     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
8790b57cec5SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
8800b57cec5SDimitry Andric 
8810b57cec5SDimitry Andric   return WaitStatesNeeded;
8820b57cec5SDimitry Andric }
8830b57cec5SDimitry Andric 
8840b57cec5SDimitry Andric int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
88581ad6265SDimitry Andric   int WaitStatesNeeded = 0;
88681ad6265SDimitry Andric 
88781ad6265SDimitry Andric   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
88881ad6265SDimitry Andric     const int TransDefWaitstates = 1;
88981ad6265SDimitry Andric 
89081ad6265SDimitry Andric     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
89181ad6265SDimitry Andric       if (!SIInstrInfo::isTRANS(MI))
89281ad6265SDimitry Andric         return false;
89381ad6265SDimitry Andric       const SIRegisterInfo *TRI = ST.getRegisterInfo();
89481ad6265SDimitry Andric       const SIInstrInfo *TII = ST.getInstrInfo();
89581ad6265SDimitry Andric       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
89681ad6265SDimitry Andric 
89781ad6265SDimitry Andric       for (const MachineOperand &Use : VALU->explicit_uses()) {
89881ad6265SDimitry Andric         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
89981ad6265SDimitry Andric           return true;
90081ad6265SDimitry Andric       }
90181ad6265SDimitry Andric 
90281ad6265SDimitry Andric       return false;
90381ad6265SDimitry Andric     };
90481ad6265SDimitry Andric 
90581ad6265SDimitry Andric     int WaitStatesNeededForDef =
90681ad6265SDimitry Andric         TransDefWaitstates -
90781ad6265SDimitry Andric         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
90881ad6265SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
90981ad6265SDimitry Andric   }
91081ad6265SDimitry Andric 
91181ad6265SDimitry Andric   if (ST.hasDstSelForwardingHazard()) {
91281ad6265SDimitry Andric     const int Shift16DefWaitstates = 1;
91381ad6265SDimitry Andric 
91481ad6265SDimitry Andric     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
91581ad6265SDimitry Andric       if (!SIInstrInfo::isVALU(MI))
91681ad6265SDimitry Andric         return false;
91781ad6265SDimitry Andric       const SIInstrInfo *TII = ST.getInstrInfo();
91881ad6265SDimitry Andric       if (SIInstrInfo::isSDWA(MI)) {
91981ad6265SDimitry Andric         if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
92081ad6265SDimitry Andric           if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
92181ad6265SDimitry Andric             return false;
92281ad6265SDimitry Andric       } else {
923bdd1243dSDimitry Andric         if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
92481ad6265SDimitry Andric             !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
92581ad6265SDimitry Andric                   ->getImm() &
92681ad6265SDimitry Andric               SISrcMods::DST_OP_SEL))
92781ad6265SDimitry Andric           return false;
92881ad6265SDimitry Andric       }
92981ad6265SDimitry Andric       const SIRegisterInfo *TRI = ST.getRegisterInfo();
93081ad6265SDimitry Andric       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
93181ad6265SDimitry Andric         Register Def = Dst->getReg();
93281ad6265SDimitry Andric 
93381ad6265SDimitry Andric         for (const MachineOperand &Use : VALU->explicit_uses()) {
93481ad6265SDimitry Andric           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
93581ad6265SDimitry Andric             return true;
93681ad6265SDimitry Andric         }
93781ad6265SDimitry Andric       }
93881ad6265SDimitry Andric 
93981ad6265SDimitry Andric       return false;
94081ad6265SDimitry Andric     };
94181ad6265SDimitry Andric 
94281ad6265SDimitry Andric     int WaitStatesNeededForDef =
94381ad6265SDimitry Andric         Shift16DefWaitstates -
94481ad6265SDimitry Andric         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
94581ad6265SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
94681ad6265SDimitry Andric   }
94781ad6265SDimitry Andric 
94881ad6265SDimitry Andric   if (ST.hasVDecCoExecHazard()) {
94981ad6265SDimitry Andric     const int VALUWriteSGPRVALUReadWaitstates = 2;
95081ad6265SDimitry Andric     const int VALUWriteEXECRWLane = 4;
95181ad6265SDimitry Andric     const int VALUWriteVGPRReadlaneRead = 1;
95281ad6265SDimitry Andric 
95381ad6265SDimitry Andric     const SIRegisterInfo *TRI = ST.getRegisterInfo();
95481ad6265SDimitry Andric     const MachineRegisterInfo &MRI = MF.getRegInfo();
95581ad6265SDimitry Andric     Register UseReg;
95681ad6265SDimitry Andric     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
95781ad6265SDimitry Andric       if (!SIInstrInfo::isVALU(MI))
95881ad6265SDimitry Andric         return false;
95981ad6265SDimitry Andric       return MI.modifiesRegister(UseReg, TRI);
96081ad6265SDimitry Andric     };
96181ad6265SDimitry Andric 
96281ad6265SDimitry Andric     for (const MachineOperand &Use : VALU->explicit_uses()) {
96381ad6265SDimitry Andric       if (!Use.isReg())
96481ad6265SDimitry Andric         continue;
96581ad6265SDimitry Andric 
96681ad6265SDimitry Andric       UseReg = Use.getReg();
96781ad6265SDimitry Andric       if (TRI->isSGPRReg(MRI, UseReg)) {
96881ad6265SDimitry Andric         int WaitStatesNeededForDef =
96981ad6265SDimitry Andric             VALUWriteSGPRVALUReadWaitstates -
97081ad6265SDimitry Andric             getWaitStatesSince(IsVALUDefSGPRFn,
97181ad6265SDimitry Andric                                VALUWriteSGPRVALUReadWaitstates);
97281ad6265SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
97381ad6265SDimitry Andric       }
97481ad6265SDimitry Andric     }
97581ad6265SDimitry Andric 
97681ad6265SDimitry Andric     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
97781ad6265SDimitry Andric       UseReg = AMDGPU::VCC;
97881ad6265SDimitry Andric       int WaitStatesNeededForDef =
97981ad6265SDimitry Andric           VALUWriteSGPRVALUReadWaitstates -
98081ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
98181ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
98281ad6265SDimitry Andric     }
98381ad6265SDimitry Andric 
98481ad6265SDimitry Andric     switch (VALU->getOpcode()) {
98581ad6265SDimitry Andric     case AMDGPU::V_READLANE_B32:
98681ad6265SDimitry Andric     case AMDGPU::V_READFIRSTLANE_B32: {
98781ad6265SDimitry Andric       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
98881ad6265SDimitry Andric       UseReg = Src->getReg();
98981ad6265SDimitry Andric       int WaitStatesNeededForDef =
99081ad6265SDimitry Andric           VALUWriteVGPRReadlaneRead -
99181ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
99281ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
99381ad6265SDimitry Andric     }
994bdd1243dSDimitry Andric       [[fallthrough]];
99581ad6265SDimitry Andric     case AMDGPU::V_WRITELANE_B32: {
99681ad6265SDimitry Andric       UseReg = AMDGPU::EXEC;
99781ad6265SDimitry Andric       int WaitStatesNeededForDef =
99881ad6265SDimitry Andric           VALUWriteEXECRWLane -
99981ad6265SDimitry Andric           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
100081ad6265SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
100181ad6265SDimitry Andric       break;
100281ad6265SDimitry Andric     }
100381ad6265SDimitry Andric     default:
100481ad6265SDimitry Andric       break;
100581ad6265SDimitry Andric     }
100681ad6265SDimitry Andric   }
100781ad6265SDimitry Andric 
10080b57cec5SDimitry Andric   // This checks for the hazard where VMEM instructions that store more than
10090b57cec5SDimitry Andric   // 8 bytes can have there store data over written by the next instruction.
10100b57cec5SDimitry Andric   if (!ST.has12DWordStoreHazard())
101181ad6265SDimitry Andric     return WaitStatesNeeded;
10120b57cec5SDimitry Andric 
10130b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10140b57cec5SDimitry Andric 
10150b57cec5SDimitry Andric   for (const MachineOperand &Def : VALU->defs()) {
10160b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
10170b57cec5SDimitry Andric   }
10180b57cec5SDimitry Andric 
10190b57cec5SDimitry Andric   return WaitStatesNeeded;
10200b57cec5SDimitry Andric }
10210b57cec5SDimitry Andric 
10220b57cec5SDimitry Andric int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10230b57cec5SDimitry Andric   // This checks for hazards associated with inline asm statements.
10240b57cec5SDimitry Andric   // Since inline asms can contain just about anything, we use this
10250b57cec5SDimitry Andric   // to call/leverage other check*Hazard routines. Note that
10260b57cec5SDimitry Andric   // this function doesn't attempt to address all possible inline asm
10270b57cec5SDimitry Andric   // hazards (good luck), but is a collection of what has been
10280b57cec5SDimitry Andric   // problematic thus far.
10290b57cec5SDimitry Andric 
10300b57cec5SDimitry Andric   // see checkVALUHazards()
10310b57cec5SDimitry Andric   if (!ST.has12DWordStoreHazard())
10320b57cec5SDimitry Andric     return 0;
10330b57cec5SDimitry Andric 
10340b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10350b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
10360b57cec5SDimitry Andric 
103706c3fb27SDimitry Andric   for (const MachineOperand &Op :
103806c3fb27SDimitry Andric        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
10390b57cec5SDimitry Andric     if (Op.isReg() && Op.isDef()) {
104006c3fb27SDimitry Andric       WaitStatesNeeded =
104106c3fb27SDimitry Andric           std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
10420b57cec5SDimitry Andric     }
10430b57cec5SDimitry Andric   }
10440b57cec5SDimitry Andric 
10450b57cec5SDimitry Andric   return WaitStatesNeeded;
10460b57cec5SDimitry Andric }
10470b57cec5SDimitry Andric 
10480b57cec5SDimitry Andric int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
10490b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
10500b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
10510b57cec5SDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
10520b57cec5SDimitry Andric 
10530b57cec5SDimitry Andric   const MachineOperand *LaneSelectOp =
10540b57cec5SDimitry Andric       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
10550b57cec5SDimitry Andric 
10560b57cec5SDimitry Andric   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
10570b57cec5SDimitry Andric     return 0;
10580b57cec5SDimitry Andric 
10598bcb0991SDimitry Andric   Register LaneSelectReg = LaneSelectOp->getReg();
1060fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
10610b57cec5SDimitry Andric 
10620b57cec5SDimitry Andric   const int RWLaneWaitStates = 4;
10630b57cec5SDimitry Andric   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
10640b57cec5SDimitry Andric                                               RWLaneWaitStates);
10650b57cec5SDimitry Andric   return RWLaneWaitStates - WaitStatesSince;
10660b57cec5SDimitry Andric }
10670b57cec5SDimitry Andric 
10680b57cec5SDimitry Andric int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
10690b57cec5SDimitry Andric   if (!ST.hasRFEHazards())
10700b57cec5SDimitry Andric     return 0;
10710b57cec5SDimitry Andric 
10720b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
10730b57cec5SDimitry Andric 
10740b57cec5SDimitry Andric   const int RFEWaitStates = 1;
10750b57cec5SDimitry Andric 
1076fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) {
1077fe6060f1SDimitry Andric     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
10780b57cec5SDimitry Andric   };
10790b57cec5SDimitry Andric   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
10800b57cec5SDimitry Andric   return RFEWaitStates - WaitStatesNeeded;
10810b57cec5SDimitry Andric }
10820b57cec5SDimitry Andric 
10830b57cec5SDimitry Andric int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
10840b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
108581ad6265SDimitry Andric   const int ReadM0WaitStates = 1;
1086fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
108781ad6265SDimitry Andric   return ReadM0WaitStates -
108881ad6265SDimitry Andric          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
10890b57cec5SDimitry Andric }
10900b57cec5SDimitry Andric 
// Run every fix* routine on MI, in the fixed order listed below. Each routine
// returns a bool, but the results are deliberately ignored here; every fixer
// is invoked regardless of what the earlier ones did.
10910b57cec5SDimitry Andric void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
10920b57cec5SDimitry Andric   fixVMEMtoScalarWriteHazards(MI);
10930b57cec5SDimitry Andric   fixVcmpxPermlaneHazards(MI);
10940b57cec5SDimitry Andric   fixSMEMtoVectorWriteHazards(MI);
10950b57cec5SDimitry Andric   fixVcmpxExecWARHazard(MI);
10960b57cec5SDimitry Andric   fixLdsBranchVmemWARHazard(MI);
// The two LDS-direct fixers are only attempted when the subtarget reports
// LDS-direct support.
109781ad6265SDimitry Andric   if (ST.hasLdsDirect()) {
109881ad6265SDimitry Andric     fixLdsDirectVALUHazard(MI);
109981ad6265SDimitry Andric     fixLdsDirectVMEMHazard(MI);
110081ad6265SDimitry Andric   }
110181ad6265SDimitry Andric   fixVALUPartialForwardingHazard(MI);
110281ad6265SDimitry Andric   fixVALUTransUseHazard(MI);
110381ad6265SDimitry Andric   fixWMMAHazards(MI);
1104bdd1243dSDimitry Andric   fixShift64HighRegBug(MI);
1105bdd1243dSDimitry Andric   fixVALUMaskWriteHazard(MI);
11060b57cec5SDimitry Andric }
11070b57cec5SDimitry Andric 
11080b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
11090b57cec5SDimitry Andric   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
11100b57cec5SDimitry Andric     return false;
11110b57cec5SDimitry Andric 
11120b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
111381ad6265SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
111481ad6265SDimitry Andric   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
111581ad6265SDimitry Andric     return (TII->isVOPC(MI) ||
111681ad6265SDimitry Andric             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
111781ad6265SDimitry Andric            MI.modifiesRegister(AMDGPU::EXEC, TRI);
111881ad6265SDimitry Andric   };
11190b57cec5SDimitry Andric 
1120fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1121fe6060f1SDimitry Andric     unsigned Opc = MI.getOpcode();
1122fe6060f1SDimitry Andric     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1123fe6060f1SDimitry Andric            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
11240b57cec5SDimitry Andric   };
11250b57cec5SDimitry Andric 
11260b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11270b57cec5SDimitry Andric       std::numeric_limits<int>::max())
11280b57cec5SDimitry Andric     return false;
11290b57cec5SDimitry Andric 
11300b57cec5SDimitry Andric   // V_NOP will be discarded by SQ.
113181ad6265SDimitry Andric   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
11320b57cec5SDimitry Andric   // which is always a VGPR and available.
11330b57cec5SDimitry Andric   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
11348bcb0991SDimitry Andric   Register Reg = Src0->getReg();
11350b57cec5SDimitry Andric   bool IsUndef = Src0->isUndef();
11360b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
11370b57cec5SDimitry Andric           TII->get(AMDGPU::V_MOV_B32_e32))
11380b57cec5SDimitry Andric     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
11390b57cec5SDimitry Andric     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
11400b57cec5SDimitry Andric 
11410b57cec5SDimitry Andric   return true;
11420b57cec5SDimitry Andric }
11430b57cec5SDimitry Andric 
11440b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
11450b57cec5SDimitry Andric   if (!ST.hasVMEMtoScalarWriteHazard())
11460b57cec5SDimitry Andric     return false;
11477a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
11480b57cec5SDimitry Andric 
11490b57cec5SDimitry Andric   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
11500b57cec5SDimitry Andric     return false;
11510b57cec5SDimitry Andric 
11520b57cec5SDimitry Andric   if (MI->getNumDefs() == 0)
11530b57cec5SDimitry Andric     return false;
11540b57cec5SDimitry Andric 
11550b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
11560b57cec5SDimitry Andric 
1157fe6060f1SDimitry Andric   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1158fe6060f1SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1159fe6060f1SDimitry Andric         !SIInstrInfo::isFLAT(I))
11600b57cec5SDimitry Andric       return false;
11610b57cec5SDimitry Andric 
11620b57cec5SDimitry Andric     for (const MachineOperand &Def : MI->defs()) {
1163fe6060f1SDimitry Andric       const MachineOperand *Op =
1164fe6060f1SDimitry Andric           I.findRegisterUseOperand(Def.getReg(), false, TRI);
11650b57cec5SDimitry Andric       if (!Op)
11660b57cec5SDimitry Andric         continue;
11670b57cec5SDimitry Andric       return true;
11680b57cec5SDimitry Andric     }
11690b57cec5SDimitry Andric     return false;
11700b57cec5SDimitry Andric   };
11710b57cec5SDimitry Andric 
1172fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1173fe6060f1SDimitry Andric     return SIInstrInfo::isVALU(MI) ||
1174fe6060f1SDimitry Andric            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1175fe6060f1SDimitry Andric             !MI.getOperand(0).getImm()) ||
1176fe6060f1SDimitry Andric            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
117706c3fb27SDimitry Andric             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
11780b57cec5SDimitry Andric   };
11790b57cec5SDimitry Andric 
11800b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11810b57cec5SDimitry Andric       std::numeric_limits<int>::max())
11820b57cec5SDimitry Andric     return false;
11830b57cec5SDimitry Andric 
11840b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1185e8d8bef9SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1186e8d8bef9SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
118706c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
11880b57cec5SDimitry Andric   return true;
11890b57cec5SDimitry Andric }
11900b57cec5SDimitry Andric 
11910b57cec5SDimitry Andric bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
11920b57cec5SDimitry Andric   if (!ST.hasSMEMtoVectorWriteHazard())
11930b57cec5SDimitry Andric     return false;
11947a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
11950b57cec5SDimitry Andric 
11960b57cec5SDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
11970b57cec5SDimitry Andric     return false;
11980b57cec5SDimitry Andric 
11990b57cec5SDimitry Andric   unsigned SDSTName;
12000b57cec5SDimitry Andric   switch (MI->getOpcode()) {
12010b57cec5SDimitry Andric   case AMDGPU::V_READLANE_B32:
12020b57cec5SDimitry Andric   case AMDGPU::V_READFIRSTLANE_B32:
12030b57cec5SDimitry Andric     SDSTName = AMDGPU::OpName::vdst;
12040b57cec5SDimitry Andric     break;
12050b57cec5SDimitry Andric   default:
12060b57cec5SDimitry Andric     SDSTName = AMDGPU::OpName::sdst;
12070b57cec5SDimitry Andric     break;
12080b57cec5SDimitry Andric   }
12090b57cec5SDimitry Andric 
12100b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
12110b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
12120b57cec5SDimitry Andric   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
12130b57cec5SDimitry Andric   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
12140b57cec5SDimitry Andric   if (!SDST) {
12150b57cec5SDimitry Andric     for (const auto &MO : MI->implicit_operands()) {
1216bdd1243dSDimitry Andric       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
12170b57cec5SDimitry Andric         SDST = &MO;
12180b57cec5SDimitry Andric         break;
12190b57cec5SDimitry Andric       }
12200b57cec5SDimitry Andric     }
12210b57cec5SDimitry Andric   }
12220b57cec5SDimitry Andric 
12230b57cec5SDimitry Andric   if (!SDST)
12240b57cec5SDimitry Andric     return false;
12250b57cec5SDimitry Andric 
12268bcb0991SDimitry Andric   const Register SDSTReg = SDST->getReg();
1227fe6060f1SDimitry Andric   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1228fe6060f1SDimitry Andric     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
12290b57cec5SDimitry Andric   };
12300b57cec5SDimitry Andric 
1231fe6060f1SDimitry Andric   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1232fe6060f1SDimitry Andric     if (TII->isSALU(MI)) {
1233fe6060f1SDimitry Andric       switch (MI.getOpcode()) {
12340b57cec5SDimitry Andric       case AMDGPU::S_SETVSKIP:
12350b57cec5SDimitry Andric       case AMDGPU::S_VERSION:
12360b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_VSCNT:
12370b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_VMCNT:
12380b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_EXPCNT:
12390b57cec5SDimitry Andric         // These instructions cannot not mitigate the hazard.
12400b57cec5SDimitry Andric         return false;
12410b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT_LGKMCNT:
12420b57cec5SDimitry Andric         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1243fe6060f1SDimitry Andric         return (MI.getOperand(1).getImm() == 0) &&
1244fe6060f1SDimitry Andric                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
12450b57cec5SDimitry Andric       case AMDGPU::S_WAITCNT: {
1246fe6060f1SDimitry Andric         const int64_t Imm = MI.getOperand(0).getImm();
12470b57cec5SDimitry Andric         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
12487a6dacacSDimitry Andric         // DsCnt corresponds to LGKMCnt here.
12497a6dacacSDimitry Andric         return (Decoded.DsCnt == 0);
12500b57cec5SDimitry Andric       }
12510b57cec5SDimitry Andric       default:
12520b57cec5SDimitry Andric         // SOPP instructions cannot mitigate the hazard.
1253fe6060f1SDimitry Andric         if (TII->isSOPP(MI))
12540b57cec5SDimitry Andric           return false;
12550b57cec5SDimitry Andric         // At this point the SALU can be assumed to mitigate the hazard
12560b57cec5SDimitry Andric         // because either:
12570b57cec5SDimitry Andric         // (a) it is independent of the at risk SMEM (breaking chain),
12580b57cec5SDimitry Andric         // or
12590b57cec5SDimitry Andric         // (b) it is dependent on the SMEM, in which case an appropriate
12600b57cec5SDimitry Andric         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
12610b57cec5SDimitry Andric         //     SMEM instruction.
12620b57cec5SDimitry Andric         return true;
12630b57cec5SDimitry Andric       }
12640b57cec5SDimitry Andric     }
12650b57cec5SDimitry Andric     return false;
12660b57cec5SDimitry Andric   };
12670b57cec5SDimitry Andric 
12680b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
12690b57cec5SDimitry Andric       std::numeric_limits<int>::max())
12700b57cec5SDimitry Andric     return false;
12710b57cec5SDimitry Andric 
12720b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
12730b57cec5SDimitry Andric           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
12740b57cec5SDimitry Andric       .addImm(0);
12750b57cec5SDimitry Andric   return true;
12760b57cec5SDimitry Andric }
12770b57cec5SDimitry Andric 
12780b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
12797a6dacacSDimitry Andric   if (!ST.hasVcmpxExecWARHazard())
12807a6dacacSDimitry Andric     return false;
12817a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
12827a6dacacSDimitry Andric 
12837a6dacacSDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
12840b57cec5SDimitry Andric     return false;
12850b57cec5SDimitry Andric 
12860b57cec5SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
12870b57cec5SDimitry Andric   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
12880b57cec5SDimitry Andric     return false;
12890b57cec5SDimitry Andric 
1290fe6060f1SDimitry Andric   auto IsHazardFn = [TRI](const MachineInstr &I) {
1291fe6060f1SDimitry Andric     if (SIInstrInfo::isVALU(I))
12920b57cec5SDimitry Andric       return false;
1293fe6060f1SDimitry Andric     return I.readsRegister(AMDGPU::EXEC, TRI);
12940b57cec5SDimitry Andric   };
12950b57cec5SDimitry Andric 
12960b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
1297fe6060f1SDimitry Andric   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1298fe6060f1SDimitry Andric     if (SIInstrInfo::isVALU(MI)) {
1299fe6060f1SDimitry Andric       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
13000b57cec5SDimitry Andric         return true;
1301fe6060f1SDimitry Andric       for (auto MO : MI.implicit_operands())
1302bdd1243dSDimitry Andric         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
13030b57cec5SDimitry Andric           return true;
13040b57cec5SDimitry Andric     }
1305fe6060f1SDimitry Andric     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
130606c3fb27SDimitry Andric         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
13070b57cec5SDimitry Andric       return true;
13080b57cec5SDimitry Andric     return false;
13090b57cec5SDimitry Andric   };
13100b57cec5SDimitry Andric 
13110b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
13120b57cec5SDimitry Andric       std::numeric_limits<int>::max())
13130b57cec5SDimitry Andric     return false;
13140b57cec5SDimitry Andric 
13150b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
13160b57cec5SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
131706c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
13180b57cec5SDimitry Andric   return true;
13190b57cec5SDimitry Andric }
13200b57cec5SDimitry Andric 
1321fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1322fe6060f1SDimitry Andric                                                  const GCNSubtarget &ST) {
13230b57cec5SDimitry Andric   if (!ST.hasLdsBranchVmemWARHazard())
13240b57cec5SDimitry Andric     return false;
13250b57cec5SDimitry Andric 
1326fe6060f1SDimitry Andric   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1327fe6060f1SDimitry Andric   // instructions need to appear in the same function.
1328fe6060f1SDimitry Andric   bool HasLds = false;
1329fe6060f1SDimitry Andric   bool HasVmem = false;
1330fe6060f1SDimitry Andric   for (auto &MBB : MF) {
1331fe6060f1SDimitry Andric     for (auto &MI : MBB) {
1332fe6060f1SDimitry Andric       HasLds |= SIInstrInfo::isDS(MI);
1333fe6060f1SDimitry Andric       HasVmem |=
1334fe6060f1SDimitry Andric           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1335fe6060f1SDimitry Andric       if (HasLds && HasVmem)
1336fe6060f1SDimitry Andric         return true;
1337fe6060f1SDimitry Andric     }
1338fe6060f1SDimitry Andric   }
1339fe6060f1SDimitry Andric   return false;
1340fe6060f1SDimitry Andric }
1341fe6060f1SDimitry Andric 
1342bdd1243dSDimitry Andric static bool isStoreCountWaitZero(const MachineInstr &I) {
1343bdd1243dSDimitry Andric   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1344bdd1243dSDimitry Andric          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1345bdd1243dSDimitry Andric          !I.getOperand(1).getImm();
1346bdd1243dSDimitry Andric }
1347bdd1243dSDimitry Andric 
1348fe6060f1SDimitry Andric bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1349fe6060f1SDimitry Andric   if (!RunLdsBranchVmemWARHazardFixup)
1350fe6060f1SDimitry Andric     return false;
1351fe6060f1SDimitry Andric 
1352fe6060f1SDimitry Andric   assert(ST.hasLdsBranchVmemWARHazard());
13537a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
1354fe6060f1SDimitry Andric 
1355fe6060f1SDimitry Andric   auto IsHazardInst = [](const MachineInstr &MI) {
1356fe6060f1SDimitry Andric     if (SIInstrInfo::isDS(MI))
13570b57cec5SDimitry Andric       return 1;
1358fe6060f1SDimitry Andric     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
13590b57cec5SDimitry Andric       return 2;
13600b57cec5SDimitry Andric     return 0;
13610b57cec5SDimitry Andric   };
13620b57cec5SDimitry Andric 
1363fe6060f1SDimitry Andric   auto InstType = IsHazardInst(*MI);
13640b57cec5SDimitry Andric   if (!InstType)
13650b57cec5SDimitry Andric     return false;
13660b57cec5SDimitry Andric 
1367fe6060f1SDimitry Andric   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1368bdd1243dSDimitry Andric     return IsHazardInst(I) || isStoreCountWaitZero(I);
13690b57cec5SDimitry Andric   };
13700b57cec5SDimitry Andric 
1371fe6060f1SDimitry Andric   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1372fe6060f1SDimitry Andric     if (!I.isBranch())
13730b57cec5SDimitry Andric       return false;
13740b57cec5SDimitry Andric 
1375fe6060f1SDimitry Andric     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
13760b57cec5SDimitry Andric       auto InstType2 = IsHazardInst(I);
13770b57cec5SDimitry Andric       return InstType2 && InstType != InstType2;
13780b57cec5SDimitry Andric     };
13790b57cec5SDimitry Andric 
1380fe6060f1SDimitry Andric     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
13810b57cec5SDimitry Andric       auto InstType2 = IsHazardInst(I);
13820b57cec5SDimitry Andric       if (InstType == InstType2)
13830b57cec5SDimitry Andric         return true;
13840b57cec5SDimitry Andric 
1385bdd1243dSDimitry Andric       return isStoreCountWaitZero(I);
13860b57cec5SDimitry Andric     };
13870b57cec5SDimitry Andric 
1388fe6060f1SDimitry Andric     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
13890b57cec5SDimitry Andric            std::numeric_limits<int>::max();
13900b57cec5SDimitry Andric   };
13910b57cec5SDimitry Andric 
13920b57cec5SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
13930b57cec5SDimitry Andric       std::numeric_limits<int>::max())
13940b57cec5SDimitry Andric     return false;
13950b57cec5SDimitry Andric 
13960b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
13970b57cec5SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
13980b57cec5SDimitry Andric           TII->get(AMDGPU::S_WAITCNT_VSCNT))
13990b57cec5SDimitry Andric     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
14000b57cec5SDimitry Andric     .addImm(0);
14010b57cec5SDimitry Andric 
14020b57cec5SDimitry Andric   return true;
14030b57cec5SDimitry Andric }
14040b57cec5SDimitry Andric 
140581ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
140681ad6265SDimitry Andric   if (!SIInstrInfo::isLDSDIR(*MI))
140781ad6265SDimitry Andric     return false;
140881ad6265SDimitry Andric 
140981ad6265SDimitry Andric   const int NoHazardWaitStates = 15;
141081ad6265SDimitry Andric   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
141181ad6265SDimitry Andric   const Register VDSTReg = VDST->getReg();
141281ad6265SDimitry Andric 
141381ad6265SDimitry Andric   bool VisitedTrans = false;
141481ad6265SDimitry Andric   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
141581ad6265SDimitry Andric     if (!SIInstrInfo::isVALU(I))
141681ad6265SDimitry Andric       return false;
141781ad6265SDimitry Andric     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
141881ad6265SDimitry Andric     // Cover both WAR and WAW
141981ad6265SDimitry Andric     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
142081ad6265SDimitry Andric   };
142181ad6265SDimitry Andric   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
142281ad6265SDimitry Andric     if (WaitStates >= NoHazardWaitStates)
142381ad6265SDimitry Andric       return true;
142481ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
142581ad6265SDimitry Andric     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
142681ad6265SDimitry Andric            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
142781ad6265SDimitry Andric   };
142881ad6265SDimitry Andric   auto GetWaitStatesFn = [](const MachineInstr &MI) {
142981ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) ? 1 : 0;
143081ad6265SDimitry Andric   };
143181ad6265SDimitry Andric 
143281ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
143381ad6265SDimitry Andric   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
143481ad6265SDimitry Andric                                     std::next(MI->getReverseIterator()), 0,
143581ad6265SDimitry Andric                                     IsExpiredFn, Visited, GetWaitStatesFn);
143681ad6265SDimitry Andric 
143781ad6265SDimitry Andric   // Transcendentals can execute in parallel to other VALUs.
143881ad6265SDimitry Andric   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
143981ad6265SDimitry Andric   if (VisitedTrans)
144081ad6265SDimitry Andric     Count = 0;
144181ad6265SDimitry Andric 
144281ad6265SDimitry Andric   MachineOperand *WaitVdstOp =
144381ad6265SDimitry Andric       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
144481ad6265SDimitry Andric   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
144581ad6265SDimitry Andric 
144681ad6265SDimitry Andric   return true;
144781ad6265SDimitry Andric }
144881ad6265SDimitry Andric 
144981ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
145081ad6265SDimitry Andric   if (!SIInstrInfo::isLDSDIR(*MI))
145181ad6265SDimitry Andric     return false;
145281ad6265SDimitry Andric 
145381ad6265SDimitry Andric   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
145481ad6265SDimitry Andric   const Register VDSTReg = VDST->getReg();
145581ad6265SDimitry Andric 
145681ad6265SDimitry Andric   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
145781ad6265SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
145881ad6265SDimitry Andric         !SIInstrInfo::isDS(I))
145981ad6265SDimitry Andric       return false;
146081ad6265SDimitry Andric     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
146181ad6265SDimitry Andric   };
1462297eecfbSDimitry Andric   bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
14637a6dacacSDimitry Andric   // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
14647a6dacacSDimitry Andric   // according to the type of VMEM instruction.
1465297eecfbSDimitry Andric   auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
146681ad6265SDimitry Andric     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
146781ad6265SDimitry Andric            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
146881ad6265SDimitry Andric            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1469297eecfbSDimitry Andric             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1470297eecfbSDimitry Andric            (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1471297eecfbSDimitry Andric             !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
147281ad6265SDimitry Andric   };
147381ad6265SDimitry Andric 
147481ad6265SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
147581ad6265SDimitry Andric       std::numeric_limits<int>::max())
147681ad6265SDimitry Andric     return false;
147781ad6265SDimitry Andric 
1478297eecfbSDimitry Andric   if (LdsdirCanWait) {
1479297eecfbSDimitry Andric     TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1480297eecfbSDimitry Andric   } else {
148181ad6265SDimitry Andric     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
148281ad6265SDimitry Andric             TII.get(AMDGPU::S_WAITCNT_DEPCTR))
148306c3fb27SDimitry Andric         .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1484297eecfbSDimitry Andric   }
148581ad6265SDimitry Andric 
148681ad6265SDimitry Andric   return true;
148781ad6265SDimitry Andric }
148881ad6265SDimitry Andric 
148981ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
149081ad6265SDimitry Andric   if (!ST.hasVALUPartialForwardingHazard())
149181ad6265SDimitry Andric     return false;
14927a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
14937a6dacacSDimitry Andric 
14947a6dacacSDimitry Andric   if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
149581ad6265SDimitry Andric     return false;
149681ad6265SDimitry Andric 
149781ad6265SDimitry Andric   SmallSetVector<Register, 4> SrcVGPRs;
149881ad6265SDimitry Andric 
149981ad6265SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
150081ad6265SDimitry Andric     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
150181ad6265SDimitry Andric       SrcVGPRs.insert(Use.getReg());
150281ad6265SDimitry Andric   }
150381ad6265SDimitry Andric 
150481ad6265SDimitry Andric   // Only applies with >= 2 unique VGPR sources
150581ad6265SDimitry Andric   if (SrcVGPRs.size() <= 1)
150681ad6265SDimitry Andric     return false;
150781ad6265SDimitry Andric 
150881ad6265SDimitry Andric   // Look for the following pattern:
150981ad6265SDimitry Andric   //   Va <- VALU [PreExecPos]
151081ad6265SDimitry Andric   //   intv1
151181ad6265SDimitry Andric   //   Exec <- SALU [ExecPos]
151281ad6265SDimitry Andric   //   intv2
151381ad6265SDimitry Andric   //   Vb <- VALU [PostExecPos]
151481ad6265SDimitry Andric   //   intv3
151581ad6265SDimitry Andric   //   MI Va, Vb (WaitState = 0)
151681ad6265SDimitry Andric   //
151781ad6265SDimitry Andric   // Where:
151881ad6265SDimitry Andric   // intv1 + intv2 <= 2 VALUs
151981ad6265SDimitry Andric   // intv3 <= 4 VALUs
152081ad6265SDimitry Andric   //
152181ad6265SDimitry Andric   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
152281ad6265SDimitry Andric 
152381ad6265SDimitry Andric   const int Intv1plus2MaxVALUs = 2;
152481ad6265SDimitry Andric   const int Intv3MaxVALUs = 4;
152581ad6265SDimitry Andric   const int IntvMaxVALUs = 6;
152681ad6265SDimitry Andric   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
152781ad6265SDimitry Andric 
152881ad6265SDimitry Andric   struct StateType {
152981ad6265SDimitry Andric     SmallDenseMap<Register, int, 4> DefPos;
153081ad6265SDimitry Andric     int ExecPos = std::numeric_limits<int>::max();
153181ad6265SDimitry Andric     int VALUs = 0;
153281ad6265SDimitry Andric   };
153381ad6265SDimitry Andric 
153481ad6265SDimitry Andric   StateType State;
153581ad6265SDimitry Andric 
153681ad6265SDimitry Andric   // This overloads expiry testing with all the hazard detection
153781ad6265SDimitry Andric   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
153881ad6265SDimitry Andric     // Too many VALU states have passed
153981ad6265SDimitry Andric     if (State.VALUs > NoHazardVALUWaitStates)
154081ad6265SDimitry Andric       return HazardExpired;
154181ad6265SDimitry Andric 
154281ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
154381ad6265SDimitry Andric     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
154481ad6265SDimitry Andric         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
154581ad6265SDimitry Andric         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
154606c3fb27SDimitry Andric          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
154781ad6265SDimitry Andric       return HazardExpired;
154881ad6265SDimitry Andric 
154981ad6265SDimitry Andric     // Track registers writes
155081ad6265SDimitry Andric     bool Changed = false;
155181ad6265SDimitry Andric     if (SIInstrInfo::isVALU(I)) {
155281ad6265SDimitry Andric       for (Register Src : SrcVGPRs) {
155381ad6265SDimitry Andric         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
155481ad6265SDimitry Andric           State.DefPos[Src] = State.VALUs;
155581ad6265SDimitry Andric           Changed = true;
155681ad6265SDimitry Andric         }
155781ad6265SDimitry Andric       }
155881ad6265SDimitry Andric     } else if (SIInstrInfo::isSALU(I)) {
155981ad6265SDimitry Andric       if (State.ExecPos == std::numeric_limits<int>::max()) {
156081ad6265SDimitry Andric         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
156181ad6265SDimitry Andric           State.ExecPos = State.VALUs;
156281ad6265SDimitry Andric           Changed = true;
156381ad6265SDimitry Andric         }
156481ad6265SDimitry Andric       }
156581ad6265SDimitry Andric     }
156681ad6265SDimitry Andric 
156781ad6265SDimitry Andric     // Early expiration: too many VALUs in intv3
156881ad6265SDimitry Andric     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
156981ad6265SDimitry Andric       return HazardExpired;
157081ad6265SDimitry Andric 
157181ad6265SDimitry Andric     // Only evaluate state if something changed
157281ad6265SDimitry Andric     if (!Changed)
157381ad6265SDimitry Andric       return NoHazardFound;
157481ad6265SDimitry Andric 
157581ad6265SDimitry Andric     // Determine positions of VALUs pre/post exec change
157681ad6265SDimitry Andric     if (State.ExecPos == std::numeric_limits<int>::max())
157781ad6265SDimitry Andric       return NoHazardFound;
157881ad6265SDimitry Andric 
157981ad6265SDimitry Andric     int PreExecPos = std::numeric_limits<int>::max();
158081ad6265SDimitry Andric     int PostExecPos = std::numeric_limits<int>::max();
158181ad6265SDimitry Andric 
158281ad6265SDimitry Andric     for (auto Entry : State.DefPos) {
158381ad6265SDimitry Andric       int DefVALUs = Entry.second;
158481ad6265SDimitry Andric       if (DefVALUs != std::numeric_limits<int>::max()) {
158581ad6265SDimitry Andric         if (DefVALUs >= State.ExecPos)
158681ad6265SDimitry Andric           PreExecPos = std::min(PreExecPos, DefVALUs);
158781ad6265SDimitry Andric         else if (DefVALUs < State.ExecPos)
158881ad6265SDimitry Andric           PostExecPos = std::min(PostExecPos, DefVALUs);
158981ad6265SDimitry Andric       }
159081ad6265SDimitry Andric     }
159181ad6265SDimitry Andric 
159281ad6265SDimitry Andric     // Need a VALUs post exec change
159381ad6265SDimitry Andric     if (PostExecPos == std::numeric_limits<int>::max())
159481ad6265SDimitry Andric       return NoHazardFound;
159581ad6265SDimitry Andric 
159681ad6265SDimitry Andric     // Too many VALUs in intv3?
159781ad6265SDimitry Andric     int Intv3VALUs = PostExecPos;
159881ad6265SDimitry Andric     if (Intv3VALUs > Intv3MaxVALUs)
159981ad6265SDimitry Andric       return HazardExpired;
160081ad6265SDimitry Andric 
160181ad6265SDimitry Andric     // Too many VALUs in intv2?
160281ad6265SDimitry Andric     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
160381ad6265SDimitry Andric     if (Intv2VALUs > Intv1plus2MaxVALUs)
160481ad6265SDimitry Andric       return HazardExpired;
160581ad6265SDimitry Andric 
160681ad6265SDimitry Andric     // Need a VALUs pre exec change
160781ad6265SDimitry Andric     if (PreExecPos == std::numeric_limits<int>::max())
160881ad6265SDimitry Andric       return NoHazardFound;
160981ad6265SDimitry Andric 
161081ad6265SDimitry Andric     // Too many VALUs in intv1?
161181ad6265SDimitry Andric     int Intv1VALUs = PreExecPos - State.ExecPos;
161281ad6265SDimitry Andric     if (Intv1VALUs > Intv1plus2MaxVALUs)
161381ad6265SDimitry Andric       return HazardExpired;
161481ad6265SDimitry Andric 
161581ad6265SDimitry Andric     // Too many VALUs in intv1 + intv2
161681ad6265SDimitry Andric     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
161781ad6265SDimitry Andric       return HazardExpired;
161881ad6265SDimitry Andric 
161981ad6265SDimitry Andric     return HazardFound;
162081ad6265SDimitry Andric   };
162181ad6265SDimitry Andric   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
162281ad6265SDimitry Andric     if (SIInstrInfo::isVALU(MI))
162381ad6265SDimitry Andric       State.VALUs += 1;
162481ad6265SDimitry Andric   };
162581ad6265SDimitry Andric 
162681ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
162781ad6265SDimitry Andric   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
162881ad6265SDimitry Andric                             std::next(MI->getReverseIterator()), Visited))
162981ad6265SDimitry Andric     return false;
163081ad6265SDimitry Andric 
163181ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
163281ad6265SDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
163381ad6265SDimitry Andric       .addImm(0x0fff);
163481ad6265SDimitry Andric 
163581ad6265SDimitry Andric   return true;
163681ad6265SDimitry Andric }
163781ad6265SDimitry Andric 
163881ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
163981ad6265SDimitry Andric   if (!ST.hasVALUTransUseHazard())
164081ad6265SDimitry Andric     return false;
16417a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
16427a6dacacSDimitry Andric 
164381ad6265SDimitry Andric   if (!SIInstrInfo::isVALU(*MI))
164481ad6265SDimitry Andric     return false;
164581ad6265SDimitry Andric 
164681ad6265SDimitry Andric   SmallSet<Register, 4> SrcVGPRs;
164781ad6265SDimitry Andric 
164881ad6265SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
164981ad6265SDimitry Andric     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
165081ad6265SDimitry Andric       SrcVGPRs.insert(Use.getReg());
165181ad6265SDimitry Andric   }
165281ad6265SDimitry Andric 
165381ad6265SDimitry Andric   // Look for the following pattern:
165481ad6265SDimitry Andric   //   Va <- TRANS VALU
165581ad6265SDimitry Andric   //   intv
165681ad6265SDimitry Andric   //   MI Va (WaitState = 0)
165781ad6265SDimitry Andric   //
165881ad6265SDimitry Andric   // Where:
165981ad6265SDimitry Andric   // intv <= 5 VALUs / 1 TRANS
166081ad6265SDimitry Andric   //
166181ad6265SDimitry Andric   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
166281ad6265SDimitry Andric 
166381ad6265SDimitry Andric   const int IntvMaxVALUs = 5;
166481ad6265SDimitry Andric   const int IntvMaxTRANS = 1;
166581ad6265SDimitry Andric 
166681ad6265SDimitry Andric   struct StateType {
166781ad6265SDimitry Andric     int VALUs = 0;
166881ad6265SDimitry Andric     int TRANS = 0;
166981ad6265SDimitry Andric   };
167081ad6265SDimitry Andric 
167181ad6265SDimitry Andric   StateType State;
167281ad6265SDimitry Andric 
167381ad6265SDimitry Andric   // This overloads expiry testing with all the hazard detection
167481ad6265SDimitry Andric   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
167581ad6265SDimitry Andric     // Too many VALU states have passed
167681ad6265SDimitry Andric     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
167781ad6265SDimitry Andric       return HazardExpired;
167881ad6265SDimitry Andric 
167981ad6265SDimitry Andric     // Instructions which cause va_vdst==0 expire hazard
168081ad6265SDimitry Andric     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
168181ad6265SDimitry Andric         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
168281ad6265SDimitry Andric         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
168381ad6265SDimitry Andric          I.getOperand(0).getImm() == 0x0fff))
168481ad6265SDimitry Andric       return HazardExpired;
168581ad6265SDimitry Andric 
168681ad6265SDimitry Andric     // Track registers writes
168781ad6265SDimitry Andric     if (SIInstrInfo::isTRANS(I)) {
168881ad6265SDimitry Andric       for (Register Src : SrcVGPRs) {
168981ad6265SDimitry Andric         if (I.modifiesRegister(Src, &TRI)) {
169081ad6265SDimitry Andric           return HazardFound;
169181ad6265SDimitry Andric         }
169281ad6265SDimitry Andric       }
169381ad6265SDimitry Andric     }
169481ad6265SDimitry Andric 
169581ad6265SDimitry Andric     return NoHazardFound;
169681ad6265SDimitry Andric   };
169781ad6265SDimitry Andric   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
169881ad6265SDimitry Andric     if (SIInstrInfo::isVALU(MI))
169981ad6265SDimitry Andric       State.VALUs += 1;
170081ad6265SDimitry Andric     if (SIInstrInfo::isTRANS(MI))
170181ad6265SDimitry Andric       State.TRANS += 1;
170281ad6265SDimitry Andric   };
170381ad6265SDimitry Andric 
170481ad6265SDimitry Andric   DenseSet<const MachineBasicBlock *> Visited;
170581ad6265SDimitry Andric   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
170681ad6265SDimitry Andric                             std::next(MI->getReverseIterator()), Visited))
170781ad6265SDimitry Andric     return false;
170881ad6265SDimitry Andric 
170981ad6265SDimitry Andric   // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
171006c3fb27SDimitry Andric   // avoided.
171181ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
171281ad6265SDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
171306c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
171481ad6265SDimitry Andric 
171581ad6265SDimitry Andric   return true;
171681ad6265SDimitry Andric }
171781ad6265SDimitry Andric 
171881ad6265SDimitry Andric bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1719*b3edf446SDimitry Andric   if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
172081ad6265SDimitry Andric     return false;
172181ad6265SDimitry Andric 
172281ad6265SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
172381ad6265SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
172481ad6265SDimitry Andric 
1725*b3edf446SDimitry Andric   auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1726*b3edf446SDimitry Andric     if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
172781ad6265SDimitry Andric       return false;
172881ad6265SDimitry Andric 
172981ad6265SDimitry Andric     // Src0 or Src1 of the current wmma instruction overlaps with the dest of
173081ad6265SDimitry Andric     // the previous wmma.
173181ad6265SDimitry Andric     const Register CurSrc0Reg =
173281ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
173381ad6265SDimitry Andric     const Register CurSrc1Reg =
173481ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
173581ad6265SDimitry Andric 
173681ad6265SDimitry Andric     const Register PrevDstReg =
173781ad6265SDimitry Andric         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
173881ad6265SDimitry Andric 
173981ad6265SDimitry Andric     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
174081ad6265SDimitry Andric         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
174181ad6265SDimitry Andric       return true;
174281ad6265SDimitry Andric     }
174381ad6265SDimitry Andric 
174481ad6265SDimitry Andric     // Src2 of the current wmma instruction overlaps with the dest of the
174581ad6265SDimitry Andric     // previous wmma.
174681ad6265SDimitry Andric     const MachineOperand *Src2 =
174781ad6265SDimitry Andric         TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
174881ad6265SDimitry Andric     const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();
174981ad6265SDimitry Andric 
175081ad6265SDimitry Andric     if (CurSrc2Reg != AMDGPU::NoRegister &&
175181ad6265SDimitry Andric         TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {
175281ad6265SDimitry Andric 
175381ad6265SDimitry Andric       const MachineOperand *Src2Mods =
175481ad6265SDimitry Andric           TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
175581ad6265SDimitry Andric       const bool NoSrc2Mods =
1756*b3edf446SDimitry Andric           !Src2Mods ||
175781ad6265SDimitry Andric           (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
175881ad6265SDimitry Andric       // Exception: there is no hazard if the wmma instructions are of the same
175981ad6265SDimitry Andric       // type and there is no input modifier on src2 of the current instruction.
176081ad6265SDimitry Andric       return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
176181ad6265SDimitry Andric                               TII->pseudoToMCOpcode(MI->getOpcode())));
176281ad6265SDimitry Andric     }
176381ad6265SDimitry Andric 
1764*b3edf446SDimitry Andric     // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1765*b3edf446SDimitry Andric     // but Index can't overlap with PrevDstReg.
1766*b3edf446SDimitry Andric     if (AMDGPU::isGFX12Plus(ST)) {
1767*b3edf446SDimitry Andric       if (SIInstrInfo::isSWMMAC(*MI)) {
1768*b3edf446SDimitry Andric         const Register CurIndex =
1769*b3edf446SDimitry Andric             TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1770*b3edf446SDimitry Andric         if (TRI->regsOverlap(PrevDstReg, CurIndex))
1771*b3edf446SDimitry Andric           return true;
1772*b3edf446SDimitry Andric       }
1773*b3edf446SDimitry Andric       return false;
1774*b3edf446SDimitry Andric     }
1775*b3edf446SDimitry Andric 
177681ad6265SDimitry Andric     return false;
177781ad6265SDimitry Andric   };
177881ad6265SDimitry Andric 
177981ad6265SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &I, int) {
178081ad6265SDimitry Andric     return SIInstrInfo::isVALU(I);
178181ad6265SDimitry Andric   };
178281ad6265SDimitry Andric 
178381ad6265SDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
178481ad6265SDimitry Andric       std::numeric_limits<int>::max())
178581ad6265SDimitry Andric     return false;
178681ad6265SDimitry Andric 
178781ad6265SDimitry Andric   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
178881ad6265SDimitry Andric 
178981ad6265SDimitry Andric   return true;
179081ad6265SDimitry Andric }
179181ad6265SDimitry Andric 
1792bdd1243dSDimitry Andric bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1793bdd1243dSDimitry Andric   if (!ST.hasShift64HighRegBug())
1794bdd1243dSDimitry Andric     return false;
17957a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
1796bdd1243dSDimitry Andric 
1797bdd1243dSDimitry Andric   switch (MI->getOpcode()) {
1798bdd1243dSDimitry Andric   default:
1799bdd1243dSDimitry Andric     return false;
1800bdd1243dSDimitry Andric   case AMDGPU::V_LSHLREV_B64_e64:
1801bdd1243dSDimitry Andric   case AMDGPU::V_LSHRREV_B64_e64:
1802bdd1243dSDimitry Andric   case AMDGPU::V_ASHRREV_I64_e64:
1803bdd1243dSDimitry Andric     break;
1804bdd1243dSDimitry Andric   }
1805bdd1243dSDimitry Andric 
1806bdd1243dSDimitry Andric   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1807bdd1243dSDimitry Andric   if (!Amt->isReg())
1808bdd1243dSDimitry Andric     return false;
1809bdd1243dSDimitry Andric 
1810bdd1243dSDimitry Andric   Register AmtReg = Amt->getReg();
1811bdd1243dSDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
1812bdd1243dSDimitry Andric   // Check if this is a last VGPR in the allocation block.
1813bdd1243dSDimitry Andric   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1814bdd1243dSDimitry Andric     return false;
1815bdd1243dSDimitry Andric 
1816bdd1243dSDimitry Andric   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1817bdd1243dSDimitry Andric     return false;
1818bdd1243dSDimitry Andric 
1819bdd1243dSDimitry Andric   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1820bdd1243dSDimitry Andric   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1821bdd1243dSDimitry Andric   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1822bdd1243dSDimitry Andric   bool Overlapped = OverlappedSrc || OverlappedDst;
1823bdd1243dSDimitry Andric 
1824bdd1243dSDimitry Andric   assert(!OverlappedDst || !OverlappedSrc ||
1825bdd1243dSDimitry Andric          Src1->getReg() == MI->getOperand(0).getReg());
1826bdd1243dSDimitry Andric   assert(ST.needsAlignedVGPRs());
1827bdd1243dSDimitry Andric   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1828bdd1243dSDimitry Andric 
1829bdd1243dSDimitry Andric   Register NewReg;
1830bdd1243dSDimitry Andric   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1831bdd1243dSDimitry Andric                                    : AMDGPU::VGPR_32RegClass) {
1832bdd1243dSDimitry Andric     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1833bdd1243dSDimitry Andric       NewReg = Reg;
1834bdd1243dSDimitry Andric       break;
1835bdd1243dSDimitry Andric     }
1836bdd1243dSDimitry Andric   }
1837bdd1243dSDimitry Andric 
1838bdd1243dSDimitry Andric   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1839bdd1243dSDimitry Andric                                : NewReg;
1840bdd1243dSDimitry Andric   Register NewAmtLo;
1841bdd1243dSDimitry Andric 
1842bdd1243dSDimitry Andric   if (Overlapped)
1843bdd1243dSDimitry Andric     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1844bdd1243dSDimitry Andric 
1845bdd1243dSDimitry Andric   DebugLoc DL = MI->getDebugLoc();
1846bdd1243dSDimitry Andric   MachineBasicBlock *MBB = MI->getParent();
1847bdd1243dSDimitry Andric   // Insert a full wait count because found register might be pending a wait.
1848bdd1243dSDimitry Andric   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1849bdd1243dSDimitry Andric       .addImm(0);
1850bdd1243dSDimitry Andric 
1851bdd1243dSDimitry Andric   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1852bdd1243dSDimitry Andric   if (Overlapped)
1853bdd1243dSDimitry Andric     runOnInstruction(
1854bdd1243dSDimitry Andric         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1855bdd1243dSDimitry Andric             .addDef(AmtReg - 1)
1856bdd1243dSDimitry Andric             .addReg(AmtReg - 1, RegState::Undef)
1857bdd1243dSDimitry Andric             .addReg(NewAmtLo, RegState::Undef));
1858bdd1243dSDimitry Andric   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1859bdd1243dSDimitry Andric                        .addDef(AmtReg)
1860bdd1243dSDimitry Andric                        .addReg(AmtReg, RegState::Undef)
1861bdd1243dSDimitry Andric                        .addReg(NewAmt, RegState::Undef));
1862bdd1243dSDimitry Andric 
1863bdd1243dSDimitry Andric   // Instructions emitted after the current instruction will be processed by the
1864bdd1243dSDimitry Andric   // parent loop of the hazard recognizer in a natural way.
1865bdd1243dSDimitry Andric   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1866bdd1243dSDimitry Andric           AmtReg)
1867bdd1243dSDimitry Andric       .addDef(NewAmt)
1868bdd1243dSDimitry Andric       .addReg(NewAmt)
1869bdd1243dSDimitry Andric       .addReg(AmtReg);
1870bdd1243dSDimitry Andric   if (Overlapped)
1871bdd1243dSDimitry Andric     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1872bdd1243dSDimitry Andric             AmtReg - 1)
1873bdd1243dSDimitry Andric         .addDef(NewAmtLo)
1874bdd1243dSDimitry Andric         .addReg(NewAmtLo)
1875bdd1243dSDimitry Andric         .addReg(AmtReg - 1);
1876bdd1243dSDimitry Andric 
1877bdd1243dSDimitry Andric   // Re-running hazard recognizer on the modified instruction is not necessary,
1878bdd1243dSDimitry Andric   // inserted V_SWAP_B32 has already both read and write new registers so
1879bdd1243dSDimitry Andric   // hazards related to these register has already been handled.
1880bdd1243dSDimitry Andric   Amt->setReg(NewAmt);
1881bdd1243dSDimitry Andric   Amt->setIsKill(false);
1882bdd1243dSDimitry Andric   // We do not update liveness, so verifier may see it as undef.
1883bdd1243dSDimitry Andric   Amt->setIsUndef();
1884bdd1243dSDimitry Andric   if (OverlappedDst)
1885bdd1243dSDimitry Andric     MI->getOperand(0).setReg(NewReg);
1886bdd1243dSDimitry Andric   if (OverlappedSrc) {
1887bdd1243dSDimitry Andric     Src1->setReg(NewReg);
1888bdd1243dSDimitry Andric     Src1->setIsKill(false);
1889bdd1243dSDimitry Andric     Src1->setIsUndef();
1890bdd1243dSDimitry Andric   }
1891bdd1243dSDimitry Andric 
1892bdd1243dSDimitry Andric   return true;
1893bdd1243dSDimitry Andric }
1894bdd1243dSDimitry Andric 
18950b57cec5SDimitry Andric int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
18960b57cec5SDimitry Andric   int NSAtoVMEMWaitStates = 1;
18970b57cec5SDimitry Andric 
18980b57cec5SDimitry Andric   if (!ST.hasNSAtoVMEMBug())
18990b57cec5SDimitry Andric     return 0;
19000b57cec5SDimitry Andric 
19010b57cec5SDimitry Andric   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
19020b57cec5SDimitry Andric     return 0;
19030b57cec5SDimitry Andric 
19040b57cec5SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
19050b57cec5SDimitry Andric   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
19060b57cec5SDimitry Andric   if (!Offset || (Offset->getImm() & 6) == 0)
19070b57cec5SDimitry Andric     return 0;
19080b57cec5SDimitry Andric 
1909fe6060f1SDimitry Andric   auto IsHazardFn = [TII](const MachineInstr &I) {
1910fe6060f1SDimitry Andric     if (!SIInstrInfo::isMIMG(I))
19110b57cec5SDimitry Andric       return false;
1912fe6060f1SDimitry Andric     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
19130b57cec5SDimitry Andric     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1914fe6060f1SDimitry Andric            TII->getInstSizeInBytes(I) >= 16;
19150b57cec5SDimitry Andric   };
19160b57cec5SDimitry Andric 
19170b57cec5SDimitry Andric   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
19180b57cec5SDimitry Andric }
19190b57cec5SDimitry Andric 
19200b57cec5SDimitry Andric int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
19210b57cec5SDimitry Andric   int FPAtomicToDenormModeWaitStates = 3;
19220b57cec5SDimitry Andric 
1923bdd1243dSDimitry Andric   if (!ST.hasFPAtomicToDenormModeHazard())
1924bdd1243dSDimitry Andric     return 0;
19257a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
1926bdd1243dSDimitry Andric 
19270b57cec5SDimitry Andric   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
19280b57cec5SDimitry Andric     return 0;
19290b57cec5SDimitry Andric 
1930fe6060f1SDimitry Andric   auto IsHazardFn = [](const MachineInstr &I) {
1931fe6060f1SDimitry Andric     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
19320b57cec5SDimitry Andric       return false;
1933fe6060f1SDimitry Andric     return SIInstrInfo::isFPAtomic(I);
19340b57cec5SDimitry Andric   };
19350b57cec5SDimitry Andric 
1936fe6060f1SDimitry Andric   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1937fe6060f1SDimitry Andric     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
19380b57cec5SDimitry Andric       return true;
19390b57cec5SDimitry Andric 
1940fe6060f1SDimitry Andric     switch (MI.getOpcode()) {
19410b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT:
19420b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_VSCNT:
19430b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_VMCNT:
19440b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_EXPCNT:
19450b57cec5SDimitry Andric     case AMDGPU::S_WAITCNT_LGKMCNT:
1946e8d8bef9SDimitry Andric     case AMDGPU::S_WAIT_IDLE:
19470b57cec5SDimitry Andric       return true;
19480b57cec5SDimitry Andric     default:
19490b57cec5SDimitry Andric       break;
19500b57cec5SDimitry Andric     }
19510b57cec5SDimitry Andric 
19520b57cec5SDimitry Andric     return false;
19530b57cec5SDimitry Andric   };
19540b57cec5SDimitry Andric 
19550b57cec5SDimitry Andric   return FPAtomicToDenormModeWaitStates -
19560b57cec5SDimitry Andric          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
19570b57cec5SDimitry Andric }
19580b57cec5SDimitry Andric 
19590b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
19600b57cec5SDimitry Andric   assert(SIInstrInfo::isMAI(*MI));
19610b57cec5SDimitry Andric 
1962fe6060f1SDimitry Andric   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1963fe6060f1SDimitry Andric }
1964fe6060f1SDimitry Andric 
196581ad6265SDimitry Andric int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
196681ad6265SDimitry Andric   // Early exit if no padding is requested.
196781ad6265SDimitry Andric   if (MFMAPaddingRatio == 0)
196881ad6265SDimitry Andric     return 0;
196981ad6265SDimitry Andric 
197081ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
197181ad6265SDimitry Andric   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
197281ad6265SDimitry Andric     return 0;
197381ad6265SDimitry Andric 
197481ad6265SDimitry Andric   int NeighborMFMALatency = 0;
197581ad6265SDimitry Andric   auto IsNeighboringMFMA = [&NeighborMFMALatency,
197681ad6265SDimitry Andric                             this](const MachineInstr &MI) {
197781ad6265SDimitry Andric     if (!SIInstrInfo::isMFMA(MI))
197881ad6265SDimitry Andric       return false;
197981ad6265SDimitry Andric 
198081ad6265SDimitry Andric     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
198181ad6265SDimitry Andric     return true;
198281ad6265SDimitry Andric   };
198381ad6265SDimitry Andric 
198481ad6265SDimitry Andric   const int MaxMFMAPipelineWaitStates = 16;
198581ad6265SDimitry Andric   int WaitStatesSinceNeighborMFMA =
198681ad6265SDimitry Andric       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
198781ad6265SDimitry Andric 
198881ad6265SDimitry Andric   int NeighborMFMAPaddingNeeded =
198981ad6265SDimitry Andric       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
199081ad6265SDimitry Andric       WaitStatesSinceNeighborMFMA;
199181ad6265SDimitry Andric 
199281ad6265SDimitry Andric   return std::max(0, NeighborMFMAPaddingNeeded);
199381ad6265SDimitry Andric }
199481ad6265SDimitry Andric 
1995fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
19960b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
19970b57cec5SDimitry Andric   unsigned Opc = MI->getOpcode();
19980b57cec5SDimitry Andric 
1999fe6060f1SDimitry Andric   auto IsVALUFn = [](const MachineInstr &MI) {
2000bdd1243dSDimitry Andric     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
20010b57cec5SDimitry Andric   };
20020b57cec5SDimitry Andric 
2003e8d8bef9SDimitry Andric   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
20040b57cec5SDimitry Andric     const int LegacyVALUWritesVGPRWaitStates = 2;
20050b57cec5SDimitry Andric     const int VALUWritesExecWaitStates = 4;
20060b57cec5SDimitry Andric     const int MaxWaitStates = 4;
20070b57cec5SDimitry Andric 
20080b57cec5SDimitry Andric     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
20090b57cec5SDimitry Andric       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
20100b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20110b57cec5SDimitry Andric 
20120b57cec5SDimitry Andric     if (WaitStatesNeeded < MaxWaitStates) {
20130b57cec5SDimitry Andric       for (const MachineOperand &Use : MI->explicit_uses()) {
20140b57cec5SDimitry Andric         const int MaxWaitStates = 2;
20150b57cec5SDimitry Andric 
20160b57cec5SDimitry Andric         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
20170b57cec5SDimitry Andric           continue;
20180b57cec5SDimitry Andric 
20190b57cec5SDimitry Andric         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
20200b57cec5SDimitry Andric           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
20210b57cec5SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20220b57cec5SDimitry Andric 
20230b57cec5SDimitry Andric         if (WaitStatesNeeded == MaxWaitStates)
20240b57cec5SDimitry Andric           break;
20250b57cec5SDimitry Andric       }
20260b57cec5SDimitry Andric     }
20270b57cec5SDimitry Andric   }
20280b57cec5SDimitry Andric 
20290b57cec5SDimitry Andric   for (const MachineOperand &Op : MI->explicit_operands()) {
20300b57cec5SDimitry Andric     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
20310b57cec5SDimitry Andric       continue;
20320b57cec5SDimitry Andric 
2033e8d8bef9SDimitry Andric     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
20340b57cec5SDimitry Andric       continue;
20350b57cec5SDimitry Andric 
20360b57cec5SDimitry Andric     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
20370b57cec5SDimitry Andric     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
20380b57cec5SDimitry Andric     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
20390b57cec5SDimitry Andric     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
20400b57cec5SDimitry Andric     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
20410b57cec5SDimitry Andric     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
20420b57cec5SDimitry Andric     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
20430b57cec5SDimitry Andric     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
20440b57cec5SDimitry Andric     const int MaxWaitStates = 18;
20458bcb0991SDimitry Andric     Register Reg = Op.getReg();
20460b57cec5SDimitry Andric     unsigned HazardDefLatency = 0;
20470b57cec5SDimitry Andric 
204881ad6265SDimitry Andric     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2049fe6060f1SDimitry Andric                                this](const MachineInstr &MI) {
205081ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
20510b57cec5SDimitry Andric         return false;
2052fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
20530b57cec5SDimitry Andric       if (DstReg == Reg)
20540b57cec5SDimitry Andric         return false;
2055fe6060f1SDimitry Andric       HazardDefLatency =
2056fe6060f1SDimitry Andric           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
20570b57cec5SDimitry Andric       return TRI.regsOverlap(DstReg, Reg);
20580b57cec5SDimitry Andric     };
20590b57cec5SDimitry Andric 
20600b57cec5SDimitry Andric     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
20610b57cec5SDimitry Andric                                                    MaxWaitStates);
20620b57cec5SDimitry Andric     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
20630b57cec5SDimitry Andric     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
206406c3fb27SDimitry Andric     int OpNo = Op.getOperandNo();
20650b57cec5SDimitry Andric     if (OpNo == SrcCIdx) {
20660b57cec5SDimitry Andric       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2067e8d8bef9SDimitry Andric     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
20680b57cec5SDimitry Andric       switch (HazardDefLatency) {
20690b57cec5SDimitry Andric       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
20700b57cec5SDimitry Andric                break;
20710b57cec5SDimitry Andric       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
20720b57cec5SDimitry Andric                break;
2073bdd1243dSDimitry Andric       case 16: [[fallthrough]];
20740b57cec5SDimitry Andric       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
20750b57cec5SDimitry Andric                break;
20760b57cec5SDimitry Andric       }
2077e8d8bef9SDimitry Andric     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
20780b57cec5SDimitry Andric       switch (HazardDefLatency) {
20790b57cec5SDimitry Andric       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
20800b57cec5SDimitry Andric                break;
20810b57cec5SDimitry Andric       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
20820b57cec5SDimitry Andric                break;
2083bdd1243dSDimitry Andric       case 16: [[fallthrough]];
20840b57cec5SDimitry Andric       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
20850b57cec5SDimitry Andric                break;
20860b57cec5SDimitry Andric       }
20870b57cec5SDimitry Andric     }
20880b57cec5SDimitry Andric 
20890b57cec5SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
20900b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20910b57cec5SDimitry Andric 
20920b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
20930b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
20940b57cec5SDimitry Andric 
2095fe6060f1SDimitry Andric     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2096fe6060f1SDimitry Andric       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
20970b57cec5SDimitry Andric         return false;
2098fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
20990b57cec5SDimitry Andric       return TRI.regsOverlap(Reg, DstReg);
21000b57cec5SDimitry Andric     };
21010b57cec5SDimitry Andric 
21020b57cec5SDimitry Andric     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
21030b57cec5SDimitry Andric     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
21040b57cec5SDimitry Andric     const int AccVGPRWriteAccVgprReadWaitStates = 3;
21050b57cec5SDimitry Andric     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
21060b57cec5SDimitry Andric     if (OpNo == SrcCIdx)
21070b57cec5SDimitry Andric       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2108e8d8bef9SDimitry Andric     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
21090b57cec5SDimitry Andric       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
21100b57cec5SDimitry Andric 
21110b57cec5SDimitry Andric     WaitStatesNeededForUse = NeedWaitStates -
21120b57cec5SDimitry Andric       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
21130b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
21140b57cec5SDimitry Andric 
21150b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
21160b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
21170b57cec5SDimitry Andric   }
21180b57cec5SDimitry Andric 
2119e8d8bef9SDimitry Andric   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
21200b57cec5SDimitry Andric     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
21210b57cec5SDimitry Andric     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
21220b57cec5SDimitry Andric     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
21230b57cec5SDimitry Andric     const int MaxWaitStates = 13;
21248bcb0991SDimitry Andric     Register DstReg = MI->getOperand(0).getReg();
21250b57cec5SDimitry Andric     unsigned HazardDefLatency = 0;
21260b57cec5SDimitry Andric 
212781ad6265SDimitry Andric     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2128fe6060f1SDimitry Andric                          this](const MachineInstr &MI) {
212981ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
21300b57cec5SDimitry Andric         return false;
2131fe6060f1SDimitry Andric       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2132fe6060f1SDimitry Andric       HazardDefLatency =
2133fe6060f1SDimitry Andric           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
21340b57cec5SDimitry Andric       return TRI.regsOverlap(Reg, DstReg);
21350b57cec5SDimitry Andric     };
21360b57cec5SDimitry Andric 
21370b57cec5SDimitry Andric     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
21380b57cec5SDimitry Andric     int NeedWaitStates;
21390b57cec5SDimitry Andric     switch (HazardDefLatency) {
21400b57cec5SDimitry Andric     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
21410b57cec5SDimitry Andric              break;
21420b57cec5SDimitry Andric     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
21430b57cec5SDimitry Andric              break;
2144bdd1243dSDimitry Andric     case 16: [[fallthrough]];
21450b57cec5SDimitry Andric     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
21460b57cec5SDimitry Andric              break;
21470b57cec5SDimitry Andric     }
21480b57cec5SDimitry Andric 
21490b57cec5SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
21500b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
21510b57cec5SDimitry Andric   }
21520b57cec5SDimitry Andric 
215381ad6265SDimitry Andric   // Pad neighboring MFMA with noops for better inter-wave performance.
215481ad6265SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
215581ad6265SDimitry Andric 
21560b57cec5SDimitry Andric   return WaitStatesNeeded;
21570b57cec5SDimitry Andric }
21580b57cec5SDimitry Andric 
// Returns the number of wait states that must elapse before the MFMA
// instruction MI may issue; returns 0 immediately when MI is not an MFMA.
//
// Three requirements are combined with std::max:
//   1. a non-MFMA VALU def of EXEC (VALUWritesExecWaitStates),
//   2. for each explicit register use, a non-MFMA, non-DOT VALU def of that
//      register (LegacyVALUNotDotWritesVGPRWaitStates),
//   3. for each explicit register use, an earlier MFMA whose destination
//      overlaps that register. The requirement comes from the constant tables
//      inside the loop, selected by whether the use is src2 ("SrcC") or another
//      operand, by the producer's opcode or scheduling-model latency, and by
//      ST.hasGFX940Insts() / isXDL() / isDGEMM().
2159fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2160fe6060f1SDimitry Andric   int WaitStatesNeeded = 0;
2161fe6060f1SDimitry Andric   unsigned Opc = MI->getOpcode();
2162fe6060f1SDimitry Andric 
  // Matches any VALU instruction that is not an MFMA.
216381ad6265SDimitry Andric   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
216481ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2165fe6060f1SDimitry Andric   };
2166fe6060f1SDimitry Andric 
  // Same as IsLegacyVALUFn, additionally excluding DOT instructions.
216781ad6265SDimitry Andric   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
216881ad6265SDimitry Andric     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
216981ad6265SDimitry Andric            !SIInstrInfo::isDOT(MI);
2170fe6060f1SDimitry Andric   };
2171fe6060f1SDimitry Andric 
  // Everything below applies to MFMA instructions only.
217281ad6265SDimitry Andric   if (!SIInstrInfo::isMFMA(*MI))
2173fe6060f1SDimitry Andric     return WaitStatesNeeded;
2174fe6060f1SDimitry Andric 
  // Requirement 1: distance to the most recent non-MFMA VALU def of EXEC.
  // The constant doubles as the search limit passed to getWaitStatesSinceDef.
2175fe6060f1SDimitry Andric   const int VALUWritesExecWaitStates = 4;
2176fe6060f1SDimitry Andric   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2177fe6060f1SDimitry Andric     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2178fe6060f1SDimitry Andric                           VALUWritesExecWaitStates);
2179fe6060f1SDimitry Andric   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2180fe6060f1SDimitry Andric 
  // Operand index of src2; the tables below call this operand "SrcC".
2181fe6060f1SDimitry Andric   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2182fe6060f1SDimitry Andric 
2183fe6060f1SDimitry Andric   // Loop for both DGEMM and S/HGEMM 2nd instruction.
2184fe6060f1SDimitry Andric   for (const MachineOperand &Use : MI->explicit_uses()) {
    // Wait-state tables. Names read as
    // <producer>WritesVGPR{Overlapped,Full}<consumer operand>WaitStates.
    // NOTE(review): the GFX940_*<N>Pass* names and the 4x4/16x16/32x32 names
    // appear to map to producer latencies 2/4/8/16 as used by the switches
    // below; confirm against the hardware documentation.
2185fe6060f1SDimitry Andric     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2186fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
218781ad6265SDimitry Andric     const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
218881ad6265SDimitry Andric     const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
218981ad6265SDimitry Andric     const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
219081ad6265SDimitry Andric     const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
219181ad6265SDimitry Andric     const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
219281ad6265SDimitry Andric     const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
219381ad6265SDimitry Andric     const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
2194fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2195fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2196fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2197fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2198fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2199fe6060f1SDimitry Andric     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2200fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2201fe6060f1SDimitry Andric     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2202fe6060f1SDimitry Andric     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2203fe6060f1SDimitry Andric     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
220481ad6265SDimitry Andric     const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
220581ad6265SDimitry Andric     const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
220681ad6265SDimitry Andric     const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
220781ad6265SDimitry Andric     const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
220881ad6265SDimitry Andric     const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
220981ad6265SDimitry Andric     const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
221081ad6265SDimitry Andric     const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
221181ad6265SDimitry Andric     const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
2212fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2213fe6060f1SDimitry Andric     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2214fe6060f1SDimitry Andric     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
221581ad6265SDimitry Andric     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    // Search limit for the per-operand lookups; equal to the largest entry in
    // the tables above.
2216fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2217fe6060f1SDimitry Andric 
2218fe6060f1SDimitry Andric     if (!Use.isReg())
2219fe6060f1SDimitry Andric       continue;
222004eeddc0SDimitry Andric     Register Reg = Use.getReg();
    // Out-parameters of IsOverlappedMFMAFn. They are left uninitialized here
    // and are only read after the INT_MAX check on NumWaitStates below.
2221fe6060f1SDimitry Andric     bool FullReg;
2222fe6060f1SDimitry Andric     const MachineInstr *MI1;
2223fe6060f1SDimitry Andric 
    // Predicate: an MFMA whose destination (operand 0) overlaps Reg. As a side
    // effect it records, for every MFMA it is invoked on (matching or not),
    // the instruction in MI1 and whether its destination is exactly Reg in
    // FullReg.
    // NOTE(review): the later reads assume the search stops at the first
    // match so that MI1/FullReg describe the matching MFMA; confirm against
    // getWaitStatesSinceDef.
222481ad6265SDimitry Andric     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2225fe6060f1SDimitry Andric                                this](const MachineInstr &MI) {
222681ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI))
2227fe6060f1SDimitry Andric         return false;
2228fe6060f1SDimitry Andric       Register DstReg = MI.getOperand(0).getReg();
2229fe6060f1SDimitry Andric       FullReg = (DstReg == Reg);
2230fe6060f1SDimitry Andric       MI1 = &MI;
2231fe6060f1SDimitry Andric       return TRI.regsOverlap(DstReg, Reg);
2232fe6060f1SDimitry Andric     };
2233fe6060f1SDimitry Andric 
    // Requirement 2: non-MFMA, non-DOT VALU def of this operand's register.
2234fe6060f1SDimitry Andric     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2235fe6060f1SDimitry Andric       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2236fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2237fe6060f1SDimitry Andric 
    // Requirement 3: overlapping MFMA def of this operand's register.
22384824e7fdSDimitry Andric     int NumWaitStates =
22394824e7fdSDimitry Andric         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    // INT_MAX is treated as "no overlapping MFMA def found"; MI1 and FullReg
    // must not be read in that case.
2240fe6060f1SDimitry Andric     if (NumWaitStates == std::numeric_limits<int>::max())
2241fe6060f1SDimitry Andric       continue;
2242fe6060f1SDimitry Andric 
    // Opc1 is the opcode of the producing MFMA recorded by the predicate.
224306c3fb27SDimitry Andric     int OpNo = Use.getOperandNo();
2244fe6060f1SDimitry Andric     unsigned Opc1 = MI1->getOpcode();
2245fe6060f1SDimitry Andric     int NeedWaitStates = 0;
2246fe6060f1SDimitry Andric     if (OpNo == SrcCIdx) {
      // The use is src2 (SrcC).
224781ad6265SDimitry Andric       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
        // Non-DGEMM consumer of a DGEMM producer on a subtarget without
        // GFX940 instructions: no wait states required.
2248fe6060f1SDimitry Andric         NeedWaitStates = 0;
2249fe6060f1SDimitry Andric       } else if (FullReg) {
        // Producer's destination is exactly this register. Only two cases
        // require waiting; in every other case NeedWaitStates stays 0.
2250fe6060f1SDimitry Andric         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2251fe6060f1SDimitry Andric              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2252fe6060f1SDimitry Andric             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2253fe6060f1SDimitry Andric              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2254fe6060f1SDimitry Andric           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
225581ad6265SDimitry Andric         else if (ST.hasGFX940Insts() &&
225681ad6265SDimitry Andric                  TSchedModel.computeInstrLatency(MI1) == 2)
225781ad6265SDimitry Andric           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2258fe6060f1SDimitry Andric       } else {
        // Partial overlap with SrcC: select by producer opcode first.
2259fe6060f1SDimitry Andric         switch (Opc1) {
2260fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2261fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
226204eeddc0SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
226304eeddc0SDimitry Andric         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          // F64 16x16x4 producer: a wait applies only when the consumer MI is
          // not XDL; otherwise NeedWaitStates stays 0.
2264fe6060f1SDimitry Andric           if (!isXDL(ST, *MI))
2265fe6060f1SDimitry Andric             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2266fe6060f1SDimitry Andric           break;
2267fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2268fe6060f1SDimitry Andric         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          // F64 4x4x4 producer: same rule with the 4x4 table entry.
2269fe6060f1SDimitry Andric           if (!isXDL(ST, *MI))
2270fe6060f1SDimitry Andric             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2271fe6060f1SDimitry Andric           break;
2272fe6060f1SDimitry Andric         default:
          // With GFX940 instructions, an XDL consumer of a non-XDL producer
          // needs no wait states (NeedWaitStates stays 0).
227381ad6265SDimitry Andric           if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
227481ad6265SDimitry Andric             break;
          // Otherwise select by the producer's scheduling-model latency.
          // Each case picks: GFX940 (XDL vs. non-XDL producer), else
          // DGEMM consumer vs. non-DGEMM consumer.
2275fe6060f1SDimitry Andric           switch (TSchedModel.computeInstrLatency(MI1)) {
2276fe6060f1SDimitry Andric           case 2:
227781ad6265SDimitry Andric             NeedWaitStates = ST.hasGFX940Insts()
227881ad6265SDimitry Andric               ? isXDL(ST, *MI1)
227981ad6265SDimitry Andric                 ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
228081ad6265SDimitry Andric                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
228181ad6265SDimitry Andric               : isDGEMM(Opc)
2282fe6060f1SDimitry Andric                 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2283fe6060f1SDimitry Andric                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2284fe6060f1SDimitry Andric             break;
228581ad6265SDimitry Andric           case 4:
            // Latency 4 is expected here only with GFX940 instructions.
228681ad6265SDimitry Andric             assert(ST.hasGFX940Insts());
228781ad6265SDimitry Andric             NeedWaitStates = isXDL(ST, *MI1)
228881ad6265SDimitry Andric               ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
228981ad6265SDimitry Andric               : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
229081ad6265SDimitry Andric             break;
2291fe6060f1SDimitry Andric           case 8:
229281ad6265SDimitry Andric             NeedWaitStates = ST.hasGFX940Insts()
229381ad6265SDimitry Andric               ? isXDL(ST, *MI1)
229481ad6265SDimitry Andric                 ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
229581ad6265SDimitry Andric                 : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
229681ad6265SDimitry Andric               : isDGEMM(Opc)
2297fe6060f1SDimitry Andric                 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2298fe6060f1SDimitry Andric                 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2299fe6060f1SDimitry Andric             break;
          // Any latency other than 2, 4 or 8 uses the 16-latency entries.
2300bdd1243dSDimitry Andric           case 16: [[fallthrough]];
2301fe6060f1SDimitry Andric           default:
230281ad6265SDimitry Andric             NeedWaitStates = ST.hasGFX940Insts()
230381ad6265SDimitry Andric               ? isXDL(ST, *MI1)
230481ad6265SDimitry Andric                 ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
230581ad6265SDimitry Andric                 : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
230681ad6265SDimitry Andric               : isDGEMM(Opc)
2307fe6060f1SDimitry Andric                 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2308fe6060f1SDimitry Andric                 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2309fe6060f1SDimitry Andric           }
2310fe6060f1SDimitry Andric         }
2311fe6060f1SDimitry Andric       }
2312fe6060f1SDimitry Andric     } else {
      // The use is some operand other than src2 ("SrcAB" tables). FullReg is
      // not consulted on this path.
2313fe6060f1SDimitry Andric       switch (Opc1) {
2314fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2315fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
231604eeddc0SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
231704eeddc0SDimitry Andric       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2318fe6060f1SDimitry Andric         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2319fe6060f1SDimitry Andric         break;
2320fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2321fe6060f1SDimitry Andric       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2322fe6060f1SDimitry Andric         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2323fe6060f1SDimitry Andric         break;
2324fe6060f1SDimitry Andric       default:
        // Select by producer latency; with GFX940 instructions the entry also
        // depends on whether the producer is XDL.
2325fe6060f1SDimitry Andric         switch (TSchedModel.computeInstrLatency(MI1)) {
2326fe6060f1SDimitry Andric         case 2:
232781ad6265SDimitry Andric           NeedWaitStates = ST.hasGFX940Insts()
232881ad6265SDimitry Andric             ? isXDL(ST, *MI1)
232981ad6265SDimitry Andric               ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
233081ad6265SDimitry Andric               : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
233181ad6265SDimitry Andric             : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
233281ad6265SDimitry Andric           break;
233381ad6265SDimitry Andric         case 4:
          // Latency 4 is expected here only with GFX940 instructions.
233481ad6265SDimitry Andric           assert(ST.hasGFX940Insts());
233581ad6265SDimitry Andric           NeedWaitStates = isXDL(ST, *MI1)
233681ad6265SDimitry Andric             ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
233781ad6265SDimitry Andric             : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2338fe6060f1SDimitry Andric           break;
2339fe6060f1SDimitry Andric         case 8:
234081ad6265SDimitry Andric           NeedWaitStates = ST.hasGFX940Insts()
234181ad6265SDimitry Andric             ? isXDL(ST, *MI1)
234281ad6265SDimitry Andric               ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
234381ad6265SDimitry Andric               : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
234481ad6265SDimitry Andric             : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2345fe6060f1SDimitry Andric           break;
        // Any latency other than 2, 4 or 8 uses the 16-latency entries.
2346bdd1243dSDimitry Andric         case 16: [[fallthrough]];
2347fe6060f1SDimitry Andric         default:
234881ad6265SDimitry Andric           NeedWaitStates = ST.hasGFX940Insts()
234981ad6265SDimitry Andric             ? isXDL(ST, *MI1)
235081ad6265SDimitry Andric               ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
235181ad6265SDimitry Andric               : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
235281ad6265SDimitry Andric             : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2353fe6060f1SDimitry Andric         }
2354fe6060f1SDimitry Andric       }
2355fe6060f1SDimitry Andric     }
    // NeedWaitStates - NumWaitStates cannot exceed NeedWaitStates (assuming
    // NumWaitStates is non-negative), so this hazard cannot raise the result.
2356fe6060f1SDimitry Andric     if (WaitStatesNeeded >= NeedWaitStates)
2357fe6060f1SDimitry Andric       continue;
2358fe6060f1SDimitry Andric 
2359fe6060f1SDimitry Andric     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2360fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2361fe6060f1SDimitry Andric 
    // No table entry exceeds MaxWaitStates, so remaining operands cannot
    // increase the result further.
2362fe6060f1SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
2363fe6060f1SDimitry Andric       break;
2364fe6060f1SDimitry Andric   }
2365fe6060f1SDimitry Andric 
2366fe6060f1SDimitry Andric   return WaitStatesNeeded;
2367fe6060f1SDimitry Andric }
2368fe6060f1SDimitry Andric 
2368fe6060f1SDimitry Andric 
23690b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2370349cc55cSDimitry Andric   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2371fe6060f1SDimitry Andric   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
23720b57cec5SDimitry Andric     return 0;
23730b57cec5SDimitry Andric 
23740b57cec5SDimitry Andric   int WaitStatesNeeded = 0;
23750b57cec5SDimitry Andric 
2376fe6060f1SDimitry Andric   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2377fe6060f1SDimitry Andric     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
23780b57cec5SDimitry Andric   };
23790b57cec5SDimitry Andric 
23800b57cec5SDimitry Andric   for (const MachineOperand &Op : MI->explicit_uses()) {
23810b57cec5SDimitry Andric     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
23820b57cec5SDimitry Andric       continue;
23830b57cec5SDimitry Andric 
23848bcb0991SDimitry Andric     Register Reg = Op.getReg();
23850b57cec5SDimitry Andric 
23860b57cec5SDimitry Andric     const int AccVgprReadLdStWaitStates = 2;
2387e8d8bef9SDimitry Andric     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
23880b57cec5SDimitry Andric     const int MaxWaitStates = 2;
23890b57cec5SDimitry Andric 
23900b57cec5SDimitry Andric     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
23910b57cec5SDimitry Andric       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
23920b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
23930b57cec5SDimitry Andric 
23940b57cec5SDimitry Andric     if (WaitStatesNeeded == MaxWaitStates)
23950b57cec5SDimitry Andric       return WaitStatesNeeded; // Early exit.
23960b57cec5SDimitry Andric 
2397fe6060f1SDimitry Andric     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2398fe6060f1SDimitry Andric       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2399fe6060f1SDimitry Andric           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
24000b57cec5SDimitry Andric         return false;
2401fe6060f1SDimitry Andric       auto IsVALUFn = [](const MachineInstr &MI) {
2402fe6060f1SDimitry Andric         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
24030b57cec5SDimitry Andric       };
24040b57cec5SDimitry Andric       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
24050b57cec5SDimitry Andric              std::numeric_limits<int>::max();
24060b57cec5SDimitry Andric     };
24070b57cec5SDimitry Andric 
2408e8d8bef9SDimitry Andric     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2409e8d8bef9SDimitry Andric       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
24100b57cec5SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
24110b57cec5SDimitry Andric   }
24120b57cec5SDimitry Andric 
24130b57cec5SDimitry Andric   return WaitStatesNeeded;
24140b57cec5SDimitry Andric }
2415e8d8bef9SDimitry Andric 
2416fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2417fe6060f1SDimitry Andric   if (!ST.hasGFX90AInsts())
2418fe6060f1SDimitry Andric     return 0;
2419fe6060f1SDimitry Andric 
2420fe6060f1SDimitry Andric   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2421fe6060f1SDimitry Andric     return isDGEMM(MI.getOpcode());
2422fe6060f1SDimitry Andric   };
2423fe6060f1SDimitry Andric 
2424fe6060f1SDimitry Andric   // This is checked in checkMAIHazards90A()
242581ad6265SDimitry Andric   if (SIInstrInfo::isMFMA(*MI))
2426fe6060f1SDimitry Andric     return 0;
2427fe6060f1SDimitry Andric 
2428bdd1243dSDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
2429bdd1243dSDimitry Andric 
2430fe6060f1SDimitry Andric   int WaitStatesNeeded = 0;
2431fe6060f1SDimitry Andric 
2432bdd1243dSDimitry Andric   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2433fe6060f1SDimitry Andric                SIInstrInfo::isFLAT(*MI) ||
2434bdd1243dSDimitry Andric                SIInstrInfo::isDS(*MI);
2435bdd1243dSDimitry Andric   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2436fe6060f1SDimitry Andric   bool IsVALU = SIInstrInfo::isVALU(*MI);
2437fe6060f1SDimitry Andric 
2438fe6060f1SDimitry Andric   const MachineInstr *MFMA = nullptr;
2439fe6060f1SDimitry Andric   unsigned Reg;
244081ad6265SDimitry Andric   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
244181ad6265SDimitry Andric     if (!SIInstrInfo::isMFMA(MI) ||
244281ad6265SDimitry Andric         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2443fe6060f1SDimitry Andric       return false;
2444fe6060f1SDimitry Andric     MFMA = &MI;
2445fe6060f1SDimitry Andric     return true;
2446fe6060f1SDimitry Andric   };
2447fe6060f1SDimitry Andric 
2448fe6060f1SDimitry Andric   const MachineInstr *DOT = nullptr;
2449fe6060f1SDimitry Andric   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2450fe6060f1SDimitry Andric     if (!SIInstrInfo::isDOT(MI) ||
2451fe6060f1SDimitry Andric         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2452fe6060f1SDimitry Andric       return false;
2453fe6060f1SDimitry Andric     DOT = &MI;
2454fe6060f1SDimitry Andric     return true;
2455fe6060f1SDimitry Andric   };
2456fe6060f1SDimitry Andric 
2457bdd1243dSDimitry Andric   bool DGEMMAfterVALUWrite = false;
2458bdd1243dSDimitry Andric   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2459bdd1243dSDimitry Andric     // Found DGEMM on reverse traversal to def.
2460bdd1243dSDimitry Andric     if (isDGEMM(MI.getOpcode()))
2461bdd1243dSDimitry Andric       DGEMMAfterVALUWrite = true;
2462bdd1243dSDimitry Andric 
2463bdd1243dSDimitry Andric     // Only hazard if register is defined by a VALU and a DGEMM is found after
2464bdd1243dSDimitry Andric     // the def.
2465bdd1243dSDimitry Andric     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2466bdd1243dSDimitry Andric       return false;
2467bdd1243dSDimitry Andric 
2468bdd1243dSDimitry Andric     return true;
2469bdd1243dSDimitry Andric   };
2470bdd1243dSDimitry Andric 
2471fe6060f1SDimitry Andric   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2472fe6060f1SDimitry Andric                                            AMDGPU::OpName::src2);
2473fe6060f1SDimitry Andric 
2474fe6060f1SDimitry Andric   if (IsMemOrExport || IsVALU) {
2475fe6060f1SDimitry Andric     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2476fe6060f1SDimitry Andric     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2477fe6060f1SDimitry Andric     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
247881ad6265SDimitry Andric     const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
247981ad6265SDimitry Andric     const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
248081ad6265SDimitry Andric     const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
248181ad6265SDimitry Andric     const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
248281ad6265SDimitry Andric     const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
248381ad6265SDimitry Andric     const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
248481ad6265SDimitry Andric     const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
248581ad6265SDimitry Andric     const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2486fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2487fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2488fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2489fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2490fe6060f1SDimitry Andric     const int DotWriteSameDotReadSrcAB = 3;
2491fe6060f1SDimitry Andric     const int DotWriteDifferentVALURead = 3;
2492bdd1243dSDimitry Andric     const int DMFMABetweenVALUWriteVMEMRead = 2;
2493fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2494fe6060f1SDimitry Andric 
2495fe6060f1SDimitry Andric     for (const MachineOperand &Use : MI->explicit_uses()) {
2496fe6060f1SDimitry Andric       if (!Use.isReg())
2497fe6060f1SDimitry Andric         continue;
2498fe6060f1SDimitry Andric       Reg = Use.getReg();
2499fe6060f1SDimitry Andric 
2500fe6060f1SDimitry Andric       DOT = nullptr;
2501fe6060f1SDimitry Andric       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2502fe6060f1SDimitry Andric                                                      MaxWaitStates);
2503fe6060f1SDimitry Andric       if (DOT) {
2504fe6060f1SDimitry Andric         int NeedWaitStates = 0;
2505fe6060f1SDimitry Andric         if (DOT->getOpcode() == MI->getOpcode()) {
2506fe6060f1SDimitry Andric           if (&Use - &MI->getOperand(0) != SrcCIdx)
2507fe6060f1SDimitry Andric             NeedWaitStates = DotWriteSameDotReadSrcAB;
2508fe6060f1SDimitry Andric         } else {
2509fe6060f1SDimitry Andric           NeedWaitStates = DotWriteDifferentVALURead;
2510fe6060f1SDimitry Andric         }
2511fe6060f1SDimitry Andric 
2512fe6060f1SDimitry Andric         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2513fe6060f1SDimitry Andric         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2514fe6060f1SDimitry Andric       }
2515fe6060f1SDimitry Andric 
2516bdd1243dSDimitry Andric       // Workaround for HW data hazard bug observed only in GFX90A. When there
2517bdd1243dSDimitry Andric       // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2518bdd1243dSDimitry Andric       // causes the SQ to incorrectly not insert two wait states between the two
2519bdd1243dSDimitry Andric       // instructions needed to avoid data hazard.
2520bdd1243dSDimitry Andric       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2521bdd1243dSDimitry Andric         DGEMMAfterVALUWrite = false;
2522bdd1243dSDimitry Andric         if (TRI.isVectorRegister(MRI, Reg)) {
2523bdd1243dSDimitry Andric           int WaitStatesNeededForUse =
2524bdd1243dSDimitry Andric                 DMFMABetweenVALUWriteVMEMRead -
2525bdd1243dSDimitry Andric                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2526bdd1243dSDimitry Andric                                       DMFMABetweenVALUWriteVMEMRead);
2527bdd1243dSDimitry Andric 
2528bdd1243dSDimitry Andric           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2529bdd1243dSDimitry Andric         }
2530bdd1243dSDimitry Andric       }
2531bdd1243dSDimitry Andric 
2532fe6060f1SDimitry Andric       MFMA = nullptr;
25334824e7fdSDimitry Andric       WaitStatesSinceDef =
25344824e7fdSDimitry Andric           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2535fe6060f1SDimitry Andric       if (!MFMA)
2536fe6060f1SDimitry Andric         continue;
2537fe6060f1SDimitry Andric 
2538fe6060f1SDimitry Andric       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2539fe6060f1SDimitry Andric       int NeedWaitStates = MaxWaitStates;
2540fe6060f1SDimitry Andric       switch (HazardDefLatency) {
2541fe6060f1SDimitry Andric       case 2:
254281ad6265SDimitry Andric         NeedWaitStates =
254381ad6265SDimitry Andric           ST.hasGFX940Insts()
254481ad6265SDimitry Andric             ? isXDL(ST, *MFMA)
254581ad6265SDimitry Andric               ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
254681ad6265SDimitry Andric               : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
254781ad6265SDimitry Andric             : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2548fe6060f1SDimitry Andric         break;
2549fe6060f1SDimitry Andric       case 4:
255081ad6265SDimitry Andric         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2551fe6060f1SDimitry Andric         NeedWaitStates =
255281ad6265SDimitry Andric           isDGEMM(MFMA->getOpcode())
255381ad6265SDimitry Andric             ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
255481ad6265SDimitry Andric                             : DMFMA4x4WriteVgprVALUReadWaitStates
255581ad6265SDimitry Andric             : isXDL(ST, *MFMA)
255681ad6265SDimitry Andric               ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
255781ad6265SDimitry Andric               : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2558fe6060f1SDimitry Andric         break;
2559fe6060f1SDimitry Andric       case 8:
256081ad6265SDimitry Andric         NeedWaitStates =
256181ad6265SDimitry Andric           ST.hasGFX940Insts()
256281ad6265SDimitry Andric             ? isXDL(ST, *MFMA)
256381ad6265SDimitry Andric               ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
256481ad6265SDimitry Andric               : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
256581ad6265SDimitry Andric             : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2566fe6060f1SDimitry Andric         break;
2567bdd1243dSDimitry Andric       case 16: [[fallthrough]];
2568fe6060f1SDimitry Andric       default:
2569fe6060f1SDimitry Andric         NeedWaitStates =
2570fe6060f1SDimitry Andric           isDGEMM(MFMA->getOpcode())
2571fe6060f1SDimitry Andric             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2572fe6060f1SDimitry Andric                             : DMFMA16x16WriteVgprVALUReadWaitStates
257381ad6265SDimitry Andric             : ST.hasGFX940Insts()
257481ad6265SDimitry Andric               ? isXDL(ST, *MFMA)
257581ad6265SDimitry Andric                 ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
257681ad6265SDimitry Andric                 : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2577fe6060f1SDimitry Andric               : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2578fe6060f1SDimitry Andric         break;
2579fe6060f1SDimitry Andric       }
2580fe6060f1SDimitry Andric 
2581fe6060f1SDimitry Andric       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2582fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2583fe6060f1SDimitry Andric 
2584fe6060f1SDimitry Andric       if (WaitStatesNeeded == MaxWaitStates)
2585fe6060f1SDimitry Andric         break;
2586fe6060f1SDimitry Andric     }
2587fe6060f1SDimitry Andric   }
2588fe6060f1SDimitry Andric 
2589fe6060f1SDimitry Andric   unsigned Opc = MI->getOpcode();
2590fe6060f1SDimitry Andric   const int DMFMAToFMA64WaitStates = 2;
2591fe6060f1SDimitry Andric   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2592fe6060f1SDimitry Andric        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2593fe6060f1SDimitry Andric        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2594fe6060f1SDimitry Andric       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2595fe6060f1SDimitry Andric     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2596fe6060f1SDimitry Andric       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2597fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2598fe6060f1SDimitry Andric   }
2599fe6060f1SDimitry Andric 
2600fe6060f1SDimitry Andric   if (!IsVALU && !IsMemOrExport)
2601fe6060f1SDimitry Andric     return WaitStatesNeeded;
2602fe6060f1SDimitry Andric 
2603fe6060f1SDimitry Andric   for (const MachineOperand &Def : MI->defs()) {
2604fe6060f1SDimitry Andric     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2605fe6060f1SDimitry Andric     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2606fe6060f1SDimitry Andric     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
260781ad6265SDimitry Andric     const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
260881ad6265SDimitry Andric     const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
260981ad6265SDimitry Andric     const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
261081ad6265SDimitry Andric     const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
261181ad6265SDimitry Andric     const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
261281ad6265SDimitry Andric     const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
261381ad6265SDimitry Andric     const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
261481ad6265SDimitry Andric     const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2615fe6060f1SDimitry Andric     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
261681ad6265SDimitry Andric     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2617fe6060f1SDimitry Andric     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2618fe6060f1SDimitry Andric     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2619fe6060f1SDimitry Andric     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2620fe6060f1SDimitry Andric     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2621fe6060f1SDimitry Andric     const int DotWriteDifferentVALUWrite = 3;
2622fe6060f1SDimitry Andric     const int MaxWaitStates = 19;
2623fe6060f1SDimitry Andric     const int MaxWarWaitStates = 15;
2624fe6060f1SDimitry Andric 
2625fe6060f1SDimitry Andric     Reg = Def.getReg();
2626fe6060f1SDimitry Andric 
2627fe6060f1SDimitry Andric     DOT = nullptr;
2628fe6060f1SDimitry Andric     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2629fe6060f1SDimitry Andric                                                    MaxWaitStates);
2630fe6060f1SDimitry Andric     if (DOT && DOT->getOpcode() != MI->getOpcode())
2631fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2632fe6060f1SDimitry Andric                                                     WaitStatesSinceDef);
2633fe6060f1SDimitry Andric 
2634fe6060f1SDimitry Andric     MFMA = nullptr;
26354824e7fdSDimitry Andric     WaitStatesSinceDef =
26364824e7fdSDimitry Andric         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2637fe6060f1SDimitry Andric     if (MFMA) {
2638fe6060f1SDimitry Andric       int NeedWaitStates = MaxWaitStates;
2639fe6060f1SDimitry Andric       switch (TSchedModel.computeInstrLatency(MFMA)) {
2640fe6060f1SDimitry Andric       case 2:
264181ad6265SDimitry Andric         NeedWaitStates = ST.hasGFX940Insts()
264281ad6265SDimitry Andric           ? isXDL(ST, *MFMA)
264381ad6265SDimitry Andric             ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
264481ad6265SDimitry Andric             : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
264581ad6265SDimitry Andric           : SMFMA4x4WriteVgprVALUWawWaitStates;
2646fe6060f1SDimitry Andric         break;
2647fe6060f1SDimitry Andric       case 4:
264881ad6265SDimitry Andric         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
264981ad6265SDimitry Andric         NeedWaitStates = isDGEMM(MFMA->getOpcode())
265081ad6265SDimitry Andric             ? DMFMA4x4WriteVgprVALUWriteWaitStates
265181ad6265SDimitry Andric             : isXDL(ST, *MFMA)
265281ad6265SDimitry Andric               ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
265381ad6265SDimitry Andric               : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2654fe6060f1SDimitry Andric         break;
2655fe6060f1SDimitry Andric       case 8:
265681ad6265SDimitry Andric         NeedWaitStates = ST.hasGFX940Insts()
265781ad6265SDimitry Andric           ? isXDL(ST, *MFMA)
265881ad6265SDimitry Andric             ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
265981ad6265SDimitry Andric             : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
266081ad6265SDimitry Andric           : SMFMA16x16WriteVgprVALUWawWaitStates;
2661fe6060f1SDimitry Andric         break;
2662bdd1243dSDimitry Andric       case 16: [[fallthrough]];
2663fe6060f1SDimitry Andric       default:
2664fe6060f1SDimitry Andric         NeedWaitStates = isDGEMM(MFMA->getOpcode())
2665fe6060f1SDimitry Andric                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
266681ad6265SDimitry Andric                    : ST.hasGFX940Insts()
266781ad6265SDimitry Andric                      ? isXDL(ST, *MFMA)
266881ad6265SDimitry Andric                        ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
266981ad6265SDimitry Andric                        : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2670fe6060f1SDimitry Andric                    : SMFMA32x32WriteVgprVALUWawWaitStates;
2671fe6060f1SDimitry Andric         break;
2672fe6060f1SDimitry Andric       }
2673fe6060f1SDimitry Andric 
2674fe6060f1SDimitry Andric       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2675fe6060f1SDimitry Andric       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2676fe6060f1SDimitry Andric 
2677fe6060f1SDimitry Andric       if (WaitStatesNeeded == MaxWaitStates)
2678fe6060f1SDimitry Andric         break;
2679fe6060f1SDimitry Andric     }
2680fe6060f1SDimitry Andric 
268181ad6265SDimitry Andric     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
268281ad6265SDimitry Andric       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2683fe6060f1SDimitry Andric           !MI.readsRegister(Reg, &TRI))
2684fe6060f1SDimitry Andric         return false;
2685fe6060f1SDimitry Andric 
268681ad6265SDimitry Andric       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
268781ad6265SDimitry Andric         return false;
268881ad6265SDimitry Andric 
2689fe6060f1SDimitry Andric       const MachineOperand *SrcC =
2690fe6060f1SDimitry Andric           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2691fe6060f1SDimitry Andric       assert(SrcC);
2692fe6060f1SDimitry Andric       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2693fe6060f1SDimitry Andric         return false;
2694fe6060f1SDimitry Andric 
2695fe6060f1SDimitry Andric       MFMA = &MI;
2696fe6060f1SDimitry Andric       return true;
2697fe6060f1SDimitry Andric     };
2698fe6060f1SDimitry Andric 
2699fe6060f1SDimitry Andric     MFMA = nullptr;
2700fe6060f1SDimitry Andric     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2701fe6060f1SDimitry Andric                                                 MaxWarWaitStates);
2702fe6060f1SDimitry Andric     if (!MFMA)
2703fe6060f1SDimitry Andric       continue;
2704fe6060f1SDimitry Andric 
2705fe6060f1SDimitry Andric     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2706fe6060f1SDimitry Andric     int NeedWaitStates = MaxWaitStates;
2707fe6060f1SDimitry Andric     switch (HazardDefLatency) {
2708fe6060f1SDimitry Andric     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2709fe6060f1SDimitry Andric              break;
271081ad6265SDimitry Andric     case 4:  assert(ST.hasGFX940Insts());
271181ad6265SDimitry Andric              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
271281ad6265SDimitry Andric              break;
2713fe6060f1SDimitry Andric     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2714fe6060f1SDimitry Andric              break;
2715bdd1243dSDimitry Andric     case 16: [[fallthrough]];
2716fe6060f1SDimitry Andric     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2717fe6060f1SDimitry Andric              break;
2718fe6060f1SDimitry Andric     }
2719fe6060f1SDimitry Andric 
2720fe6060f1SDimitry Andric     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2721fe6060f1SDimitry Andric     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2722fe6060f1SDimitry Andric   }
2723fe6060f1SDimitry Andric 
2724fe6060f1SDimitry Andric   return WaitStatesNeeded;
2725fe6060f1SDimitry Andric }
2726fe6060f1SDimitry Andric 
2727e8d8bef9SDimitry Andric bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2728e8d8bef9SDimitry Andric   if (!SU->isInstr())
2729e8d8bef9SDimitry Andric     return false;
2730e8d8bef9SDimitry Andric 
2731fe6060f1SDimitry Andric   const MachineInstr *MAI = nullptr;
273281ad6265SDimitry Andric 
2733fe6060f1SDimitry Andric   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2734e8d8bef9SDimitry Andric     MAI = nullptr;
273581ad6265SDimitry Andric     if (SIInstrInfo::isMFMA(MI))
2736fe6060f1SDimitry Andric       MAI = &MI;
2737e8d8bef9SDimitry Andric     return MAI != nullptr;
2738e8d8bef9SDimitry Andric   };
2739e8d8bef9SDimitry Andric 
2740e8d8bef9SDimitry Andric   MachineInstr *MI = SU->getInstr();
2741fe6060f1SDimitry Andric   if (IsMFMAFn(*MI)) {
2742e8d8bef9SDimitry Andric     int W = getWaitStatesSince(IsMFMAFn, 16);
2743e8d8bef9SDimitry Andric     if (MAI)
2744e8d8bef9SDimitry Andric       return W < (int)TSchedModel.computeInstrLatency(MAI);
2745e8d8bef9SDimitry Andric   }
2746e8d8bef9SDimitry Andric 
2747e8d8bef9SDimitry Andric   return false;
2748e8d8bef9SDimitry Andric }
2749bdd1243dSDimitry Andric 
2750bdd1243dSDimitry Andric bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2751bdd1243dSDimitry Andric   if (!ST.hasVALUMaskWriteHazard())
2752bdd1243dSDimitry Andric     return false;
27537a6dacacSDimitry Andric   assert(!ST.hasExtendedWaitCounts());
27547a6dacacSDimitry Andric 
27557a6dacacSDimitry Andric   if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2756bdd1243dSDimitry Andric     return false;
2757bdd1243dSDimitry Andric 
2758bdd1243dSDimitry Andric   // The hazard sequence is three instructions:
2759bdd1243dSDimitry Andric   //   1. VALU reads SGPR as mask
2760bdd1243dSDimitry Andric   //   2. SALU writes SGPR
2761bdd1243dSDimitry Andric   //   3. SALU reads SGPR
2762bdd1243dSDimitry Andric   // The hazard can expire if the distance between 2 and 3 is sufficient.
2763bdd1243dSDimitry Andric   // In practice this happens <10% of the time, hence this always assumes
2764bdd1243dSDimitry Andric   // the hazard exists if 1 and 2 are present to avoid searching.
2765bdd1243dSDimitry Andric 
2766bdd1243dSDimitry Andric   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2767bdd1243dSDimitry Andric   if (!SDSTOp || !SDSTOp->isReg())
2768bdd1243dSDimitry Andric     return false;
2769bdd1243dSDimitry Andric 
2770bdd1243dSDimitry Andric   const Register HazardReg = SDSTOp->getReg();
2771bdd1243dSDimitry Andric   if (HazardReg == AMDGPU::EXEC ||
2772bdd1243dSDimitry Andric       HazardReg == AMDGPU::EXEC_LO ||
2773bdd1243dSDimitry Andric       HazardReg == AMDGPU::EXEC_HI ||
2774bdd1243dSDimitry Andric       HazardReg == AMDGPU::M0)
2775bdd1243dSDimitry Andric     return false;
2776bdd1243dSDimitry Andric 
2777bdd1243dSDimitry Andric   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2778bdd1243dSDimitry Andric     switch (I.getOpcode()) {
2779bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_e32:
2780bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_dpp:
2781bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_e32:
2782bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_dpp:
2783bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_e32:
2784bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_dpp:
2785bdd1243dSDimitry Andric     case AMDGPU::V_DIV_FMAS_F32_e64:
2786bdd1243dSDimitry Andric     case AMDGPU::V_DIV_FMAS_F64_e64:
2787bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_e32:
2788bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_dpp:
2789bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_e32:
2790bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_dpp:
2791bdd1243dSDimitry Andric       // These implicitly read VCC as mask source.
2792bdd1243dSDimitry Andric       return HazardReg == AMDGPU::VCC ||
2793bdd1243dSDimitry Andric              HazardReg == AMDGPU::VCC_LO ||
2794bdd1243dSDimitry Andric              HazardReg == AMDGPU::VCC_HI;
2795bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_e64:
2796bdd1243dSDimitry Andric     case AMDGPU::V_ADDC_U32_e64_dpp:
2797bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_e64:
2798bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2799bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_e64:
2800bdd1243dSDimitry Andric     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2801bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_e64:
2802bdd1243dSDimitry Andric     case AMDGPU::V_SUBB_U32_e64_dpp:
2803bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_e64:
2804bdd1243dSDimitry Andric     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2805bdd1243dSDimitry Andric       // Only check mask register overlaps.
2806bdd1243dSDimitry Andric       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2807bdd1243dSDimitry Andric       assert(SSRCOp);
2808bdd1243dSDimitry Andric       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2809bdd1243dSDimitry Andric     }
2810bdd1243dSDimitry Andric     default:
2811bdd1243dSDimitry Andric       return false;
2812bdd1243dSDimitry Andric     }
2813bdd1243dSDimitry Andric   };
2814bdd1243dSDimitry Andric 
2815bdd1243dSDimitry Andric   const MachineRegisterInfo &MRI = MF.getRegInfo();
2816bdd1243dSDimitry Andric   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2817bdd1243dSDimitry Andric     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2818bdd1243dSDimitry Andric     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
281906c3fb27SDimitry Andric         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2820bdd1243dSDimitry Andric       return true;
2821bdd1243dSDimitry Andric 
2822bdd1243dSDimitry Andric     // VALU access to any SGPR or literal constant other than HazardReg
2823bdd1243dSDimitry Andric     // mitigates hazard. No need to check HazardReg here as this will
2824bdd1243dSDimitry Andric     // only be called when !IsHazardFn.
2825bdd1243dSDimitry Andric     if (!SIInstrInfo::isVALU(I))
2826bdd1243dSDimitry Andric       return false;
2827bdd1243dSDimitry Andric     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2828bdd1243dSDimitry Andric       const MachineOperand &Op = I.getOperand(OpNo);
2829bdd1243dSDimitry Andric       if (Op.isReg()) {
2830bdd1243dSDimitry Andric         Register OpReg = Op.getReg();
2831bdd1243dSDimitry Andric         // Only consider uses
2832bdd1243dSDimitry Andric         if (!Op.isUse())
2833bdd1243dSDimitry Andric           continue;
2834bdd1243dSDimitry Andric         // Ignore EXEC
2835bdd1243dSDimitry Andric         if (OpReg == AMDGPU::EXEC ||
2836bdd1243dSDimitry Andric             OpReg == AMDGPU::EXEC_LO ||
2837bdd1243dSDimitry Andric             OpReg == AMDGPU::EXEC_HI)
2838bdd1243dSDimitry Andric           continue;
2839bdd1243dSDimitry Andric         // Ignore all implicit uses except VCC
2840bdd1243dSDimitry Andric         if (Op.isImplicit()) {
2841bdd1243dSDimitry Andric           if (OpReg == AMDGPU::VCC ||
2842bdd1243dSDimitry Andric               OpReg == AMDGPU::VCC_LO ||
2843bdd1243dSDimitry Andric               OpReg == AMDGPU::VCC_HI)
2844bdd1243dSDimitry Andric             return true;
2845bdd1243dSDimitry Andric           continue;
2846bdd1243dSDimitry Andric         }
2847bdd1243dSDimitry Andric         if (TRI.isSGPRReg(MRI, OpReg))
2848bdd1243dSDimitry Andric           return true;
2849bdd1243dSDimitry Andric       } else {
2850bdd1243dSDimitry Andric         const MCInstrDesc &InstDesc = I.getDesc();
2851bdd1243dSDimitry Andric         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2852bdd1243dSDimitry Andric         if (!TII.isInlineConstant(Op, OpInfo))
2853bdd1243dSDimitry Andric           return true;
2854bdd1243dSDimitry Andric       }
2855bdd1243dSDimitry Andric     }
2856bdd1243dSDimitry Andric     return false;
2857bdd1243dSDimitry Andric   };
2858bdd1243dSDimitry Andric 
2859bdd1243dSDimitry Andric   // Check for hazard
2860bdd1243dSDimitry Andric   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2861bdd1243dSDimitry Andric       std::numeric_limits<int>::max())
2862bdd1243dSDimitry Andric     return false;
2863bdd1243dSDimitry Andric 
2864bdd1243dSDimitry Andric   auto NextMI = std::next(MI->getIterator());
2865bdd1243dSDimitry Andric 
2866bdd1243dSDimitry Andric   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2867bdd1243dSDimitry Andric   BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2868bdd1243dSDimitry Andric           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
286906c3fb27SDimitry Andric       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2870bdd1243dSDimitry Andric 
2871bdd1243dSDimitry Andric   // SALU write may be s_getpc in a bundle.
2872bdd1243dSDimitry Andric   if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2873bdd1243dSDimitry Andric     // Update offsets of any references in the bundle.
2874bdd1243dSDimitry Andric     while (NextMI != MI->getParent()->end() &&
2875bdd1243dSDimitry Andric            NextMI->isBundledWithPred()) {
2876bdd1243dSDimitry Andric       for (auto &Operand : NextMI->operands()) {
2877bdd1243dSDimitry Andric         if (Operand.isGlobal())
2878bdd1243dSDimitry Andric           Operand.setOffset(Operand.getOffset() + 4);
2879bdd1243dSDimitry Andric       }
2880bdd1243dSDimitry Andric       NextMI++;
2881bdd1243dSDimitry Andric     }
2882bdd1243dSDimitry Andric   }
2883bdd1243dSDimitry Andric 
2884bdd1243dSDimitry Andric   return true;
2885bdd1243dSDimitry Andric }
2886