//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric
130b57cec5SDimitry Andric #include "GCNHazardRecognizer.h"
14e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
150b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1681ad6265SDimitry Andric #include "SIMachineFunctionInfo.h"
17*0fca6ea1SDimitry Andric #include "llvm/CodeGen/MachineFrameInfo.h"
180b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h"
190b57cec5SDimitry Andric #include "llvm/CodeGen/ScheduleDAG.h"
2006c3fb27SDimitry Andric #include "llvm/TargetParser/TargetParser.h"
210b57cec5SDimitry Andric
220b57cec5SDimitry Andric using namespace llvm;
230b57cec5SDimitry Andric
2481ad6265SDimitry Andric namespace {
2581ad6265SDimitry Andric
2681ad6265SDimitry Andric struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
MFMAPaddingRatioParser__anon585a98fd0111::MFMAPaddingRatioParser2781ad6265SDimitry Andric MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
2881ad6265SDimitry Andric
parse__anon585a98fd0111::MFMAPaddingRatioParser2981ad6265SDimitry Andric bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
3081ad6265SDimitry Andric if (Arg.getAsInteger(0, Value))
3181ad6265SDimitry Andric return O.error("'" + Arg + "' value invalid for uint argument!");
3281ad6265SDimitry Andric
3381ad6265SDimitry Andric if (Value > 100)
3481ad6265SDimitry Andric return O.error("'" + Arg + "' value must be in the range [0, 100]!");
3581ad6265SDimitry Andric
3681ad6265SDimitry Andric return false;
3781ad6265SDimitry Andric }
3881ad6265SDimitry Andric };
3981ad6265SDimitry Andric
4081ad6265SDimitry Andric } // end anonymous namespace
4181ad6265SDimitry Andric
4281ad6265SDimitry Andric static cl::opt<unsigned, false, MFMAPaddingRatioParser>
4381ad6265SDimitry Andric MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
4481ad6265SDimitry Andric cl::desc("Fill a percentage of the latency between "
4581ad6265SDimitry Andric "neighboring MFMA with s_nops."));
4681ad6265SDimitry Andric
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
500b57cec5SDimitry Andric
51fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
52fe6060f1SDimitry Andric const GCNSubtarget &ST);
53fe6060f1SDimitry Andric
GCNHazardRecognizer(const MachineFunction & MF)540b57cec5SDimitry Andric GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
550b57cec5SDimitry Andric IsHazardRecognizerMode(false),
560b57cec5SDimitry Andric CurrCycleInstr(nullptr),
570b57cec5SDimitry Andric MF(MF),
580b57cec5SDimitry Andric ST(MF.getSubtarget<GCNSubtarget>()),
590b57cec5SDimitry Andric TII(*ST.getInstrInfo()),
600b57cec5SDimitry Andric TRI(TII.getRegisterInfo()),
610b57cec5SDimitry Andric ClauseUses(TRI.getNumRegUnits()),
620b57cec5SDimitry Andric ClauseDefs(TRI.getNumRegUnits()) {
63fe6060f1SDimitry Andric MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
640b57cec5SDimitry Andric TSchedModel.init(&ST);
65fe6060f1SDimitry Andric RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
660b57cec5SDimitry Andric }
670b57cec5SDimitry Andric
Reset()68e8d8bef9SDimitry Andric void GCNHazardRecognizer::Reset() {
69e8d8bef9SDimitry Andric EmittedInstrs.clear();
70e8d8bef9SDimitry Andric }
71e8d8bef9SDimitry Andric
EmitInstruction(SUnit * SU)720b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
730b57cec5SDimitry Andric EmitInstruction(SU->getInstr());
740b57cec5SDimitry Andric }
750b57cec5SDimitry Andric
EmitInstruction(MachineInstr * MI)760b57cec5SDimitry Andric void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
770b57cec5SDimitry Andric CurrCycleInstr = MI;
780b57cec5SDimitry Andric }
790b57cec5SDimitry Andric
isDivFMas(unsigned Opcode)800b57cec5SDimitry Andric static bool isDivFMas(unsigned Opcode) {
81e8d8bef9SDimitry Andric return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
820b57cec5SDimitry Andric }
830b57cec5SDimitry Andric
isSGetReg(unsigned Opcode)840b57cec5SDimitry Andric static bool isSGetReg(unsigned Opcode) {
850b57cec5SDimitry Andric return Opcode == AMDGPU::S_GETREG_B32;
860b57cec5SDimitry Andric }
870b57cec5SDimitry Andric
isSSetReg(unsigned Opcode)880b57cec5SDimitry Andric static bool isSSetReg(unsigned Opcode) {
89e8d8bef9SDimitry Andric switch (Opcode) {
90e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_B32:
91e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_B32_mode:
92e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_IMM32_B32:
93e8d8bef9SDimitry Andric case AMDGPU::S_SETREG_IMM32_B32_mode:
94e8d8bef9SDimitry Andric return true;
95e8d8bef9SDimitry Andric }
96e8d8bef9SDimitry Andric return false;
970b57cec5SDimitry Andric }
980b57cec5SDimitry Andric
isRWLane(unsigned Opcode)990b57cec5SDimitry Andric static bool isRWLane(unsigned Opcode) {
1000b57cec5SDimitry Andric return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
1010b57cec5SDimitry Andric }
1020b57cec5SDimitry Andric
isRFE(unsigned Opcode)1030b57cec5SDimitry Andric static bool isRFE(unsigned Opcode) {
1040b57cec5SDimitry Andric return Opcode == AMDGPU::S_RFE_B64;
1050b57cec5SDimitry Andric }
1060b57cec5SDimitry Andric
isSMovRel(unsigned Opcode)1070b57cec5SDimitry Andric static bool isSMovRel(unsigned Opcode) {
1080b57cec5SDimitry Andric switch (Opcode) {
1090b57cec5SDimitry Andric case AMDGPU::S_MOVRELS_B32:
1100b57cec5SDimitry Andric case AMDGPU::S_MOVRELS_B64:
1110b57cec5SDimitry Andric case AMDGPU::S_MOVRELD_B32:
1120b57cec5SDimitry Andric case AMDGPU::S_MOVRELD_B64:
1130b57cec5SDimitry Andric return true;
1140b57cec5SDimitry Andric default:
1150b57cec5SDimitry Andric return false;
1160b57cec5SDimitry Andric }
1170b57cec5SDimitry Andric }
1180b57cec5SDimitry Andric
isDGEMM(unsigned Opcode)119fe6060f1SDimitry Andric static bool isDGEMM(unsigned Opcode) {
12081ad6265SDimitry Andric return AMDGPU::getMAIIsDGEMM(Opcode);
121fe6060f1SDimitry Andric }
122fe6060f1SDimitry Andric
isXDL(const GCNSubtarget & ST,const MachineInstr & MI)123fe6060f1SDimitry Andric static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
124fe6060f1SDimitry Andric unsigned Opcode = MI.getOpcode();
125fe6060f1SDimitry Andric
126fe6060f1SDimitry Andric if (!SIInstrInfo::isMAI(MI) ||
127fe6060f1SDimitry Andric isDGEMM(Opcode) ||
128fe6060f1SDimitry Andric Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
129fe6060f1SDimitry Andric Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
130fe6060f1SDimitry Andric return false;
131fe6060f1SDimitry Andric
13281ad6265SDimitry Andric if (!ST.hasGFX940Insts())
133fe6060f1SDimitry Andric return true;
13481ad6265SDimitry Andric
13581ad6265SDimitry Andric return AMDGPU::getMAIIsGFX940XDL(Opcode);
136fe6060f1SDimitry Andric }
137fe6060f1SDimitry Andric
isSendMsgTraceDataOrGDS(const SIInstrInfo & TII,const MachineInstr & MI)1380b57cec5SDimitry Andric static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
1390b57cec5SDimitry Andric const MachineInstr &MI) {
1400b57cec5SDimitry Andric if (TII.isAlwaysGDS(MI.getOpcode()))
1410b57cec5SDimitry Andric return true;
1420b57cec5SDimitry Andric
1430b57cec5SDimitry Andric switch (MI.getOpcode()) {
1440b57cec5SDimitry Andric case AMDGPU::S_SENDMSG:
1450b57cec5SDimitry Andric case AMDGPU::S_SENDMSGHALT:
1460b57cec5SDimitry Andric case AMDGPU::S_TTRACEDATA:
1470b57cec5SDimitry Andric return true;
1480b57cec5SDimitry Andric // These DS opcodes don't support GDS.
1490b57cec5SDimitry Andric case AMDGPU::DS_NOP:
1500b57cec5SDimitry Andric case AMDGPU::DS_PERMUTE_B32:
1510b57cec5SDimitry Andric case AMDGPU::DS_BPERMUTE_B32:
1520b57cec5SDimitry Andric return false;
1530b57cec5SDimitry Andric default:
1540b57cec5SDimitry Andric if (TII.isDS(MI.getOpcode())) {
1550b57cec5SDimitry Andric int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1560b57cec5SDimitry Andric AMDGPU::OpName::gds);
1570b57cec5SDimitry Andric if (MI.getOperand(GDS).getImm())
1580b57cec5SDimitry Andric return true;
1590b57cec5SDimitry Andric }
1600b57cec5SDimitry Andric return false;
1610b57cec5SDimitry Andric }
1620b57cec5SDimitry Andric }
1630b57cec5SDimitry Andric
isPermlane(const MachineInstr & MI)1640b57cec5SDimitry Andric static bool isPermlane(const MachineInstr &MI) {
1650b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode();
166e8d8bef9SDimitry Andric return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
1677a6dacacSDimitry Andric Opcode == AMDGPU::V_PERMLANE64_B32 ||
1685f757f3fSDimitry Andric Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
1695f757f3fSDimitry Andric Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
1705f757f3fSDimitry Andric Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
1710b57cec5SDimitry Andric }
1720b57cec5SDimitry Andric
isLdsDma(const MachineInstr & MI)17381ad6265SDimitry Andric static bool isLdsDma(const MachineInstr &MI) {
17481ad6265SDimitry Andric return SIInstrInfo::isVALU(MI) &&
17581ad6265SDimitry Andric (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
17681ad6265SDimitry Andric }
17781ad6265SDimitry Andric
getHWReg(const SIInstrInfo * TII,const MachineInstr & RegInstr)1780b57cec5SDimitry Andric static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
1790b57cec5SDimitry Andric const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
1800b57cec5SDimitry Andric AMDGPU::OpName::simm16);
181*0fca6ea1SDimitry Andric return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
1820b57cec5SDimitry Andric }
1830b57cec5SDimitry Andric
1840b57cec5SDimitry Andric ScheduleHazardRecognizer::HazardType
getHazardType(SUnit * SU,int Stalls)1850b57cec5SDimitry Andric GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
1860b57cec5SDimitry Andric MachineInstr *MI = SU->getInstr();
187e8d8bef9SDimitry Andric // If we are not in "HazardRecognizerMode" and therefore not being run from
188e8d8bef9SDimitry Andric // the scheduler, track possible stalls from hazards but don't insert noops.
189e8d8bef9SDimitry Andric auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
190e8d8bef9SDimitry Andric
1910b57cec5SDimitry Andric if (MI->isBundle())
1920b57cec5SDimitry Andric return NoHazard;
1930b57cec5SDimitry Andric
1940b57cec5SDimitry Andric if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
195e8d8bef9SDimitry Andric return HazardType;
1960b57cec5SDimitry Andric
1970b57cec5SDimitry Andric if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
198e8d8bef9SDimitry Andric return HazardType;
1990b57cec5SDimitry Andric
2000b57cec5SDimitry Andric if (checkFPAtomicToDenormModeHazard(MI) > 0)
201e8d8bef9SDimitry Andric return HazardType;
2020b57cec5SDimitry Andric
2030b57cec5SDimitry Andric if (ST.hasNoDataDepHazard())
2040b57cec5SDimitry Andric return NoHazard;
2050b57cec5SDimitry Andric
206fe6060f1SDimitry Andric // FIXME: Should flat be considered vmem?
207fe6060f1SDimitry Andric if ((SIInstrInfo::isVMEM(*MI) ||
208fe6060f1SDimitry Andric SIInstrInfo::isFLAT(*MI))
209fe6060f1SDimitry Andric && checkVMEMHazards(MI) > 0)
210fe6060f1SDimitry Andric return HazardType;
211fe6060f1SDimitry Andric
2120b57cec5SDimitry Andric if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
213e8d8bef9SDimitry Andric return HazardType;
2140b57cec5SDimitry Andric
2150b57cec5SDimitry Andric if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
216e8d8bef9SDimitry Andric return HazardType;
2170b57cec5SDimitry Andric
2180b57cec5SDimitry Andric if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
219e8d8bef9SDimitry Andric return HazardType;
2200b57cec5SDimitry Andric
2210b57cec5SDimitry Andric if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
222e8d8bef9SDimitry Andric return HazardType;
2230b57cec5SDimitry Andric
224fe6060f1SDimitry Andric if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
225fe6060f1SDimitry Andric SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
226fe6060f1SDimitry Andric SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
227fe6060f1SDimitry Andric return HazardType;
228fe6060f1SDimitry Andric
2290b57cec5SDimitry Andric if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
230e8d8bef9SDimitry Andric return HazardType;
2310b57cec5SDimitry Andric
2320b57cec5SDimitry Andric if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
233e8d8bef9SDimitry Andric return HazardType;
2340b57cec5SDimitry Andric
2350b57cec5SDimitry Andric if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
236e8d8bef9SDimitry Andric return HazardType;
2370b57cec5SDimitry Andric
23881ad6265SDimitry Andric if (((ST.hasReadM0MovRelInterpHazard() &&
239bdd1243dSDimitry Andric (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
240bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
241bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
24281ad6265SDimitry Andric (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
24381ad6265SDimitry Andric (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
24481ad6265SDimitry Andric (ST.hasReadM0LdsDirectHazard() &&
245*0fca6ea1SDimitry Andric MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
2460b57cec5SDimitry Andric checkReadM0Hazards(MI) > 0)
247e8d8bef9SDimitry Andric return HazardType;
2480b57cec5SDimitry Andric
2490b57cec5SDimitry Andric if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
250e8d8bef9SDimitry Andric return HazardType;
2510b57cec5SDimitry Andric
252e8d8bef9SDimitry Andric if ((SIInstrInfo::isVMEM(*MI) ||
253e8d8bef9SDimitry Andric SIInstrInfo::isFLAT(*MI) ||
254e8d8bef9SDimitry Andric SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
255e8d8bef9SDimitry Andric return HazardType;
2560b57cec5SDimitry Andric
2570b57cec5SDimitry Andric if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
258e8d8bef9SDimitry Andric return HazardType;
2590b57cec5SDimitry Andric
2600b57cec5SDimitry Andric return NoHazard;
2610b57cec5SDimitry Andric }
2620b57cec5SDimitry Andric
insertNoopsInBundle(MachineInstr * MI,const SIInstrInfo & TII,unsigned Quantity)263e8d8bef9SDimitry Andric static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
264e8d8bef9SDimitry Andric unsigned Quantity) {
265e8d8bef9SDimitry Andric while (Quantity > 0) {
266e8d8bef9SDimitry Andric unsigned Arg = std::min(Quantity, 8u);
267e8d8bef9SDimitry Andric Quantity -= Arg;
2680b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
269e8d8bef9SDimitry Andric .addImm(Arg - 1);
270e8d8bef9SDimitry Andric }
2710b57cec5SDimitry Andric }
2720b57cec5SDimitry Andric
27381ad6265SDimitry Andric unsigned
getMFMAPipelineWaitStates(const MachineInstr & MI) const27481ad6265SDimitry Andric GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
27581ad6265SDimitry Andric const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
27681ad6265SDimitry Andric assert(TSchedModel.getWriteProcResBegin(SC) !=
27781ad6265SDimitry Andric TSchedModel.getWriteProcResEnd(SC));
2785f757f3fSDimitry Andric return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
27981ad6265SDimitry Andric }
28081ad6265SDimitry Andric
processBundle()2810b57cec5SDimitry Andric void GCNHazardRecognizer::processBundle() {
2820b57cec5SDimitry Andric MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
2830b57cec5SDimitry Andric MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
2840b57cec5SDimitry Andric // Check bundled MachineInstr's for hazards.
2850b57cec5SDimitry Andric for (; MI != E && MI->isInsideBundle(); ++MI) {
2860b57cec5SDimitry Andric CurrCycleInstr = &*MI;
2870b57cec5SDimitry Andric unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
2880b57cec5SDimitry Andric
289e8d8bef9SDimitry Andric if (IsHazardRecognizerMode) {
2900b57cec5SDimitry Andric fixHazards(CurrCycleInstr);
2910b57cec5SDimitry Andric
292e8d8bef9SDimitry Andric insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
293e8d8bef9SDimitry Andric }
2940b57cec5SDimitry Andric
2950b57cec5SDimitry Andric // It’s unnecessary to track more than MaxLookAhead instructions. Since we
2960b57cec5SDimitry Andric // include the bundled MI directly after, only add a maximum of
2970b57cec5SDimitry Andric // (MaxLookAhead - 1) noops to EmittedInstrs.
2980b57cec5SDimitry Andric for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
2990b57cec5SDimitry Andric EmittedInstrs.push_front(nullptr);
3000b57cec5SDimitry Andric
3010b57cec5SDimitry Andric EmittedInstrs.push_front(CurrCycleInstr);
3020b57cec5SDimitry Andric EmittedInstrs.resize(MaxLookAhead);
3030b57cec5SDimitry Andric }
3040b57cec5SDimitry Andric CurrCycleInstr = nullptr;
3050b57cec5SDimitry Andric }
3060b57cec5SDimitry Andric
runOnInstruction(MachineInstr * MI)307bdd1243dSDimitry Andric void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
308bdd1243dSDimitry Andric assert(IsHazardRecognizerMode);
309bdd1243dSDimitry Andric
310bdd1243dSDimitry Andric unsigned NumPreNoops = PreEmitNoops(MI);
311bdd1243dSDimitry Andric EmitNoops(NumPreNoops);
312bdd1243dSDimitry Andric if (MI->isInsideBundle())
313bdd1243dSDimitry Andric insertNoopsInBundle(MI, TII, NumPreNoops);
314bdd1243dSDimitry Andric else
315bdd1243dSDimitry Andric TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
316bdd1243dSDimitry Andric NumPreNoops);
317bdd1243dSDimitry Andric EmitInstruction(MI);
318bdd1243dSDimitry Andric AdvanceCycle();
319bdd1243dSDimitry Andric }
320bdd1243dSDimitry Andric
PreEmitNoops(MachineInstr * MI)3210b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
3220b57cec5SDimitry Andric IsHazardRecognizerMode = true;
3230b57cec5SDimitry Andric CurrCycleInstr = MI;
3240b57cec5SDimitry Andric unsigned W = PreEmitNoopsCommon(MI);
3250b57cec5SDimitry Andric fixHazards(MI);
3260b57cec5SDimitry Andric CurrCycleInstr = nullptr;
3270b57cec5SDimitry Andric return W;
3280b57cec5SDimitry Andric }
3290b57cec5SDimitry Andric
PreEmitNoopsCommon(MachineInstr * MI)3300b57cec5SDimitry Andric unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
3310b57cec5SDimitry Andric if (MI->isBundle())
3320b57cec5SDimitry Andric return 0;
3330b57cec5SDimitry Andric
334e8d8bef9SDimitry Andric int WaitStates = 0;
3350b57cec5SDimitry Andric
3360b57cec5SDimitry Andric if (SIInstrInfo::isSMRD(*MI))
3370b57cec5SDimitry Andric return std::max(WaitStates, checkSMRDHazards(MI));
3380b57cec5SDimitry Andric
3390b57cec5SDimitry Andric if (ST.hasNSAtoVMEMBug())
3400b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
3410b57cec5SDimitry Andric
3420b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
3430b57cec5SDimitry Andric
3440b57cec5SDimitry Andric if (ST.hasNoDataDepHazard())
3450b57cec5SDimitry Andric return WaitStates;
3460b57cec5SDimitry Andric
347fe6060f1SDimitry Andric if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
348fe6060f1SDimitry Andric WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
349fe6060f1SDimitry Andric
3500b57cec5SDimitry Andric if (SIInstrInfo::isVALU(*MI))
3510b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkVALUHazards(MI));
3520b57cec5SDimitry Andric
3530b57cec5SDimitry Andric if (SIInstrInfo::isDPP(*MI))
3540b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkDPPHazards(MI));
3550b57cec5SDimitry Andric
3560b57cec5SDimitry Andric if (isDivFMas(MI->getOpcode()))
3570b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
3580b57cec5SDimitry Andric
3590b57cec5SDimitry Andric if (isRWLane(MI->getOpcode()))
3600b57cec5SDimitry Andric WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
3610b57cec5SDimitry Andric
362fe6060f1SDimitry Andric if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
363fe6060f1SDimitry Andric SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
364fe6060f1SDimitry Andric SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
365fe6060f1SDimitry Andric WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
366fe6060f1SDimitry Andric
3670b57cec5SDimitry Andric if (MI->isInlineAsm())
3680b57cec5SDimitry Andric return std::max(WaitStates, checkInlineAsmHazards(MI));
3690b57cec5SDimitry Andric
3700b57cec5SDimitry Andric if (isSGetReg(MI->getOpcode()))
3710b57cec5SDimitry Andric return std::max(WaitStates, checkGetRegHazards(MI));
3720b57cec5SDimitry Andric
3730b57cec5SDimitry Andric if (isSSetReg(MI->getOpcode()))
3740b57cec5SDimitry Andric return std::max(WaitStates, checkSetRegHazards(MI));
3750b57cec5SDimitry Andric
3760b57cec5SDimitry Andric if (isRFE(MI->getOpcode()))
3770b57cec5SDimitry Andric return std::max(WaitStates, checkRFEHazards(MI));
3780b57cec5SDimitry Andric
37981ad6265SDimitry Andric if ((ST.hasReadM0MovRelInterpHazard() &&
380bdd1243dSDimitry Andric (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
381bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
382bdd1243dSDimitry Andric MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
38381ad6265SDimitry Andric (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
38481ad6265SDimitry Andric (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
385*0fca6ea1SDimitry Andric (ST.hasReadM0LdsDirectHazard() &&
386*0fca6ea1SDimitry Andric MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
3870b57cec5SDimitry Andric return std::max(WaitStates, checkReadM0Hazards(MI));
3880b57cec5SDimitry Andric
3890b57cec5SDimitry Andric if (SIInstrInfo::isMAI(*MI))
3900b57cec5SDimitry Andric return std::max(WaitStates, checkMAIHazards(MI));
3910b57cec5SDimitry Andric
392e8d8bef9SDimitry Andric if (SIInstrInfo::isVMEM(*MI) ||
393e8d8bef9SDimitry Andric SIInstrInfo::isFLAT(*MI) ||
394e8d8bef9SDimitry Andric SIInstrInfo::isDS(*MI))
3950b57cec5SDimitry Andric return std::max(WaitStates, checkMAILdStHazards(MI));
3960b57cec5SDimitry Andric
3970b57cec5SDimitry Andric return WaitStates;
3980b57cec5SDimitry Andric }
3990b57cec5SDimitry Andric
EmitNoop()4000b57cec5SDimitry Andric void GCNHazardRecognizer::EmitNoop() {
4010b57cec5SDimitry Andric EmittedInstrs.push_front(nullptr);
4020b57cec5SDimitry Andric }
4030b57cec5SDimitry Andric
AdvanceCycle()4040b57cec5SDimitry Andric void GCNHazardRecognizer::AdvanceCycle() {
4050b57cec5SDimitry Andric // When the scheduler detects a stall, it will call AdvanceCycle() without
4060b57cec5SDimitry Andric // emitting any instructions.
407e8d8bef9SDimitry Andric if (!CurrCycleInstr) {
408e8d8bef9SDimitry Andric EmittedInstrs.push_front(nullptr);
4090b57cec5SDimitry Andric return;
410e8d8bef9SDimitry Andric }
4110b57cec5SDimitry Andric
4120b57cec5SDimitry Andric if (CurrCycleInstr->isBundle()) {
4130b57cec5SDimitry Andric processBundle();
4140b57cec5SDimitry Andric return;
4150b57cec5SDimitry Andric }
4160b57cec5SDimitry Andric
4170b57cec5SDimitry Andric unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
418349cc55cSDimitry Andric if (!NumWaitStates) {
419349cc55cSDimitry Andric CurrCycleInstr = nullptr;
420349cc55cSDimitry Andric return;
421349cc55cSDimitry Andric }
4220b57cec5SDimitry Andric
4230b57cec5SDimitry Andric // Keep track of emitted instructions
4240b57cec5SDimitry Andric EmittedInstrs.push_front(CurrCycleInstr);
4250b57cec5SDimitry Andric
4260b57cec5SDimitry Andric // Add a nullptr for each additional wait state after the first. Make sure
4270b57cec5SDimitry Andric // not to add more than getMaxLookAhead() items to the list, since we
4280b57cec5SDimitry Andric // truncate the list to that size right after this loop.
4290b57cec5SDimitry Andric for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
4300b57cec5SDimitry Andric i < e; ++i) {
4310b57cec5SDimitry Andric EmittedInstrs.push_front(nullptr);
4320b57cec5SDimitry Andric }
4330b57cec5SDimitry Andric
4340b57cec5SDimitry Andric // getMaxLookahead() is the largest number of wait states we will ever need
4350b57cec5SDimitry Andric // to insert, so there is no point in keeping track of more than that many
4360b57cec5SDimitry Andric // wait states.
4370b57cec5SDimitry Andric EmittedInstrs.resize(getMaxLookAhead());
4380b57cec5SDimitry Andric
4390b57cec5SDimitry Andric CurrCycleInstr = nullptr;
4400b57cec5SDimitry Andric }
4410b57cec5SDimitry Andric
RecedeCycle()4420b57cec5SDimitry Andric void GCNHazardRecognizer::RecedeCycle() {
4430b57cec5SDimitry Andric llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
4440b57cec5SDimitry Andric }
4450b57cec5SDimitry Andric
//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//
4490b57cec5SDimitry Andric
450*0fca6ea1SDimitry Andric using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
45181ad6265SDimitry Andric
452*0fca6ea1SDimitry Andric using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
453*0fca6ea1SDimitry Andric using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
45481ad6265SDimitry Andric
45581ad6265SDimitry Andric // Search for a hazard in a block and its predecessors.
45681ad6265SDimitry Andric template <typename StateT>
45781ad6265SDimitry Andric static bool
hasHazard(StateT State,function_ref<HazardFnResult (StateT &,const MachineInstr &)> IsHazard,function_ref<void (StateT &,const MachineInstr &)> UpdateState,const MachineBasicBlock * MBB,MachineBasicBlock::const_reverse_instr_iterator I,DenseSet<const MachineBasicBlock * > & Visited)45881ad6265SDimitry Andric hasHazard(StateT State,
45981ad6265SDimitry Andric function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
46081ad6265SDimitry Andric function_ref<void(StateT &, const MachineInstr &)> UpdateState,
46181ad6265SDimitry Andric const MachineBasicBlock *MBB,
46281ad6265SDimitry Andric MachineBasicBlock::const_reverse_instr_iterator I,
46381ad6265SDimitry Andric DenseSet<const MachineBasicBlock *> &Visited) {
46481ad6265SDimitry Andric for (auto E = MBB->instr_rend(); I != E; ++I) {
46581ad6265SDimitry Andric // No need to look at parent BUNDLE instructions.
46681ad6265SDimitry Andric if (I->isBundle())
46781ad6265SDimitry Andric continue;
46881ad6265SDimitry Andric
46981ad6265SDimitry Andric switch (IsHazard(State, *I)) {
47081ad6265SDimitry Andric case HazardFound:
47181ad6265SDimitry Andric return true;
47281ad6265SDimitry Andric case HazardExpired:
47381ad6265SDimitry Andric return false;
47481ad6265SDimitry Andric default:
47581ad6265SDimitry Andric // Continue search
47681ad6265SDimitry Andric break;
47781ad6265SDimitry Andric }
47881ad6265SDimitry Andric
47981ad6265SDimitry Andric if (I->isInlineAsm() || I->isMetaInstruction())
48081ad6265SDimitry Andric continue;
48181ad6265SDimitry Andric
48281ad6265SDimitry Andric UpdateState(State, *I);
48381ad6265SDimitry Andric }
48481ad6265SDimitry Andric
48581ad6265SDimitry Andric for (MachineBasicBlock *Pred : MBB->predecessors()) {
48681ad6265SDimitry Andric if (!Visited.insert(Pred).second)
48781ad6265SDimitry Andric continue;
48881ad6265SDimitry Andric
48981ad6265SDimitry Andric if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
49081ad6265SDimitry Andric Visited))
49181ad6265SDimitry Andric return true;
49281ad6265SDimitry Andric }
49381ad6265SDimitry Andric
49481ad6265SDimitry Andric return false;
49581ad6265SDimitry Andric }
4960b57cec5SDimitry Andric
4970b57cec5SDimitry Andric // Returns a minimum wait states since \p I walking all predecessors.
4980b57cec5SDimitry Andric // Only scans until \p IsExpired does not return true.
4990b57cec5SDimitry Andric // Can only be run in a hazard recognizer mode.
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,const MachineBasicBlock * MBB,MachineBasicBlock::const_reverse_instr_iterator I,int WaitStates,IsExpiredFn IsExpired,DenseSet<const MachineBasicBlock * > & Visited,GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)50081ad6265SDimitry Andric static int getWaitStatesSince(
50181ad6265SDimitry Andric GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
50281ad6265SDimitry Andric MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
50381ad6265SDimitry Andric IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
50481ad6265SDimitry Andric GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
5050b57cec5SDimitry Andric for (auto E = MBB->instr_rend(); I != E; ++I) {
5060b57cec5SDimitry Andric // Don't add WaitStates for parent BUNDLE instructions.
5070b57cec5SDimitry Andric if (I->isBundle())
5080b57cec5SDimitry Andric continue;
5090b57cec5SDimitry Andric
510fe6060f1SDimitry Andric if (IsHazard(*I))
5110b57cec5SDimitry Andric return WaitStates;
5120b57cec5SDimitry Andric
513349cc55cSDimitry Andric if (I->isInlineAsm())
5140b57cec5SDimitry Andric continue;
5150b57cec5SDimitry Andric
51681ad6265SDimitry Andric WaitStates += GetNumWaitStates(*I);
5170b57cec5SDimitry Andric
518fe6060f1SDimitry Andric if (IsExpired(*I, WaitStates))
5190b57cec5SDimitry Andric return std::numeric_limits<int>::max();
5200b57cec5SDimitry Andric }
5210b57cec5SDimitry Andric
522fe6060f1SDimitry Andric int MinWaitStates = std::numeric_limits<int>::max();
5230b57cec5SDimitry Andric for (MachineBasicBlock *Pred : MBB->predecessors()) {
5240b57cec5SDimitry Andric if (!Visited.insert(Pred).second)
5250b57cec5SDimitry Andric continue;
5260b57cec5SDimitry Andric
52781ad6265SDimitry Andric int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
52881ad6265SDimitry Andric IsExpired, Visited, GetNumWaitStates);
5290b57cec5SDimitry Andric
530fe6060f1SDimitry Andric MinWaitStates = std::min(MinWaitStates, W);
5310b57cec5SDimitry Andric }
5320b57cec5SDimitry Andric
5330b57cec5SDimitry Andric return MinWaitStates;
5340b57cec5SDimitry Andric }
5350b57cec5SDimitry Andric
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,const MachineInstr * MI,IsExpiredFn IsExpired)5360b57cec5SDimitry Andric static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
537fe6060f1SDimitry Andric const MachineInstr *MI, IsExpiredFn IsExpired) {
5380b57cec5SDimitry Andric DenseSet<const MachineBasicBlock *> Visited;
5390b57cec5SDimitry Andric return getWaitStatesSince(IsHazard, MI->getParent(),
5400b57cec5SDimitry Andric std::next(MI->getReverseIterator()),
5410b57cec5SDimitry Andric 0, IsExpired, Visited);
5420b57cec5SDimitry Andric }
5430b57cec5SDimitry Andric
getWaitStatesSince(IsHazardFn IsHazard,int Limit)5440b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
5450b57cec5SDimitry Andric if (IsHazardRecognizerMode) {
546fe6060f1SDimitry Andric auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
5470b57cec5SDimitry Andric return WaitStates >= Limit;
5480b57cec5SDimitry Andric };
5490b57cec5SDimitry Andric return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
5500b57cec5SDimitry Andric }
5510b57cec5SDimitry Andric
5520b57cec5SDimitry Andric int WaitStates = 0;
5530b57cec5SDimitry Andric for (MachineInstr *MI : EmittedInstrs) {
5540b57cec5SDimitry Andric if (MI) {
555fe6060f1SDimitry Andric if (IsHazard(*MI))
5560b57cec5SDimitry Andric return WaitStates;
5570b57cec5SDimitry Andric
5580b57cec5SDimitry Andric if (MI->isInlineAsm())
5590b57cec5SDimitry Andric continue;
5600b57cec5SDimitry Andric }
5610b57cec5SDimitry Andric ++WaitStates;
5620b57cec5SDimitry Andric
5630b57cec5SDimitry Andric if (WaitStates >= Limit)
5640b57cec5SDimitry Andric break;
5650b57cec5SDimitry Andric }
5660b57cec5SDimitry Andric return std::numeric_limits<int>::max();
5670b57cec5SDimitry Andric }
5680b57cec5SDimitry Andric
getWaitStatesSinceDef(unsigned Reg,IsHazardFn IsHazardDef,int Limit)5690b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
5700b57cec5SDimitry Andric IsHazardFn IsHazardDef,
5710b57cec5SDimitry Andric int Limit) {
5720b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
5730b57cec5SDimitry Andric
574fe6060f1SDimitry Andric auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
575fe6060f1SDimitry Andric return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
5760b57cec5SDimitry Andric };
5770b57cec5SDimitry Andric
5780b57cec5SDimitry Andric return getWaitStatesSince(IsHazardFn, Limit);
5790b57cec5SDimitry Andric }
5800b57cec5SDimitry Andric
getWaitStatesSinceSetReg(IsHazardFn IsHazard,int Limit)5810b57cec5SDimitry Andric int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
5820b57cec5SDimitry Andric int Limit) {
583fe6060f1SDimitry Andric auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
584fe6060f1SDimitry Andric return isSSetReg(MI.getOpcode()) && IsHazard(MI);
5850b57cec5SDimitry Andric };
5860b57cec5SDimitry Andric
5870b57cec5SDimitry Andric return getWaitStatesSince(IsHazardFn, Limit);
5880b57cec5SDimitry Andric }
5890b57cec5SDimitry Andric
5900b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5910b57cec5SDimitry Andric // No-op Hazard Detection
5920b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
5930b57cec5SDimitry Andric
addRegUnits(const SIRegisterInfo & TRI,BitVector & BV,MCRegister Reg)594e8d8bef9SDimitry Andric static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
595e8d8bef9SDimitry Andric MCRegister Reg) {
59606c3fb27SDimitry Andric for (MCRegUnit Unit : TRI.regunits(Reg))
59706c3fb27SDimitry Andric BV.set(Unit);
5980b57cec5SDimitry Andric }
5990b57cec5SDimitry Andric
addRegsToSet(const SIRegisterInfo & TRI,iterator_range<MachineInstr::const_mop_iterator> Ops,BitVector & DefSet,BitVector & UseSet)6000b57cec5SDimitry Andric static void addRegsToSet(const SIRegisterInfo &TRI,
6010b57cec5SDimitry Andric iterator_range<MachineInstr::const_mop_iterator> Ops,
60206c3fb27SDimitry Andric BitVector &DefSet, BitVector &UseSet) {
6030b57cec5SDimitry Andric for (const MachineOperand &Op : Ops) {
6040b57cec5SDimitry Andric if (Op.isReg())
60506c3fb27SDimitry Andric addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
6060b57cec5SDimitry Andric }
6070b57cec5SDimitry Andric }
6080b57cec5SDimitry Andric
addClauseInst(const MachineInstr & MI)6090b57cec5SDimitry Andric void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
61006c3fb27SDimitry Andric addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
6110b57cec5SDimitry Andric }
6120b57cec5SDimitry Andric
breaksSMEMSoftClause(MachineInstr * MI)6135ffd83dbSDimitry Andric static bool breaksSMEMSoftClause(MachineInstr *MI) {
6145ffd83dbSDimitry Andric return !SIInstrInfo::isSMRD(*MI);
6155ffd83dbSDimitry Andric }
6165ffd83dbSDimitry Andric
breaksVMEMSoftClause(MachineInstr * MI)6175ffd83dbSDimitry Andric static bool breaksVMEMSoftClause(MachineInstr *MI) {
6185ffd83dbSDimitry Andric return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
6195ffd83dbSDimitry Andric }
6205ffd83dbSDimitry Andric
checkSoftClauseHazards(MachineInstr * MEM)6210b57cec5SDimitry Andric int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
6220b57cec5SDimitry Andric // SMEM soft clause are only present on VI+, and only matter if xnack is
6230b57cec5SDimitry Andric // enabled.
6240b57cec5SDimitry Andric if (!ST.isXNACKEnabled())
6250b57cec5SDimitry Andric return 0;
6260b57cec5SDimitry Andric
6270b57cec5SDimitry Andric bool IsSMRD = TII.isSMRD(*MEM);
6280b57cec5SDimitry Andric
6290b57cec5SDimitry Andric resetClause();
6300b57cec5SDimitry Andric
6310b57cec5SDimitry Andric // A soft-clause is any group of consecutive SMEM instructions. The
6320b57cec5SDimitry Andric // instructions in this group may return out of order and/or may be
6330b57cec5SDimitry Andric // replayed (i.e. the same instruction issued more than once).
6340b57cec5SDimitry Andric //
6350b57cec5SDimitry Andric // In order to handle these situations correctly we need to make sure that
6360b57cec5SDimitry Andric // when a clause has more than one instruction, no instruction in the clause
6370b57cec5SDimitry Andric // writes to a register that is read by another instruction in the clause
63881ad6265SDimitry Andric // (including itself). If we encounter this situation, we need to break the
6390b57cec5SDimitry Andric // clause by inserting a non SMEM instruction.
6400b57cec5SDimitry Andric
6410b57cec5SDimitry Andric for (MachineInstr *MI : EmittedInstrs) {
6420b57cec5SDimitry Andric // When we hit a non-SMEM instruction then we have passed the start of the
6430b57cec5SDimitry Andric // clause and we can stop.
6440b57cec5SDimitry Andric if (!MI)
6450b57cec5SDimitry Andric break;
6460b57cec5SDimitry Andric
6475ffd83dbSDimitry Andric if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
6480b57cec5SDimitry Andric break;
6490b57cec5SDimitry Andric
6500b57cec5SDimitry Andric addClauseInst(*MI);
6510b57cec5SDimitry Andric }
6520b57cec5SDimitry Andric
6530b57cec5SDimitry Andric if (ClauseDefs.none())
6540b57cec5SDimitry Andric return 0;
6550b57cec5SDimitry Andric
6560b57cec5SDimitry Andric // We need to make sure not to put loads and stores in the same clause if they
6570b57cec5SDimitry Andric // use the same address. For now, just start a new clause whenever we see a
6580b57cec5SDimitry Andric // store.
6590b57cec5SDimitry Andric if (MEM->mayStore())
6600b57cec5SDimitry Andric return 1;
6610b57cec5SDimitry Andric
6620b57cec5SDimitry Andric addClauseInst(*MEM);
6630b57cec5SDimitry Andric
6640b57cec5SDimitry Andric // If the set of defs and uses intersect then we cannot add this instruction
6650b57cec5SDimitry Andric // to the clause, so we have a hazard.
6660b57cec5SDimitry Andric return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
6670b57cec5SDimitry Andric }
6680b57cec5SDimitry Andric
checkSMRDHazards(MachineInstr * SMRD)6690b57cec5SDimitry Andric int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
6700b57cec5SDimitry Andric int WaitStatesNeeded = 0;
6710b57cec5SDimitry Andric
6720b57cec5SDimitry Andric WaitStatesNeeded = checkSoftClauseHazards(SMRD);
6730b57cec5SDimitry Andric
6740b57cec5SDimitry Andric // This SMRD hazard only affects SI.
6750b57cec5SDimitry Andric if (!ST.hasSMRDReadVALUDefHazard())
6760b57cec5SDimitry Andric return WaitStatesNeeded;
6770b57cec5SDimitry Andric
6780b57cec5SDimitry Andric // A read of an SGPR by SMRD instruction requires 4 wait states when the
6790b57cec5SDimitry Andric // SGPR was written by a VALU instruction.
6800b57cec5SDimitry Andric int SmrdSgprWaitStates = 4;
681fe6060f1SDimitry Andric auto IsHazardDefFn = [this](const MachineInstr &MI) {
682fe6060f1SDimitry Andric return TII.isVALU(MI);
683fe6060f1SDimitry Andric };
684fe6060f1SDimitry Andric auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
685fe6060f1SDimitry Andric return TII.isSALU(MI);
686fe6060f1SDimitry Andric };
6870b57cec5SDimitry Andric
6880b57cec5SDimitry Andric bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
6890b57cec5SDimitry Andric
6900b57cec5SDimitry Andric for (const MachineOperand &Use : SMRD->uses()) {
6910b57cec5SDimitry Andric if (!Use.isReg())
6920b57cec5SDimitry Andric continue;
6930b57cec5SDimitry Andric int WaitStatesNeededForUse =
6940b57cec5SDimitry Andric SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
6950b57cec5SDimitry Andric SmrdSgprWaitStates);
6960b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
6970b57cec5SDimitry Andric
6980b57cec5SDimitry Andric // This fixes what appears to be undocumented hardware behavior in SI where
6990b57cec5SDimitry Andric // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
7000b57cec5SDimitry Andric // needs some number of nops in between. We don't know how many we need, but
7010b57cec5SDimitry Andric // let's use 4. This wasn't discovered before probably because the only
7020b57cec5SDimitry Andric // case when this happens is when we expand a 64-bit pointer into a full
7030b57cec5SDimitry Andric // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
7040b57cec5SDimitry Andric // probably never encountered in the closed-source land.
7050b57cec5SDimitry Andric if (IsBufferSMRD) {
7060b57cec5SDimitry Andric int WaitStatesNeededForUse =
7070b57cec5SDimitry Andric SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
7080b57cec5SDimitry Andric IsBufferHazardDefFn,
7090b57cec5SDimitry Andric SmrdSgprWaitStates);
7100b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7110b57cec5SDimitry Andric }
7120b57cec5SDimitry Andric }
7130b57cec5SDimitry Andric
7140b57cec5SDimitry Andric return WaitStatesNeeded;
7150b57cec5SDimitry Andric }
7160b57cec5SDimitry Andric
checkVMEMHazards(MachineInstr * VMEM)7170b57cec5SDimitry Andric int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
7180b57cec5SDimitry Andric if (!ST.hasVMEMReadSGPRVALUDefHazard())
7190b57cec5SDimitry Andric return 0;
7200b57cec5SDimitry Andric
7210b57cec5SDimitry Andric int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
7220b57cec5SDimitry Andric
7230b57cec5SDimitry Andric // A read of an SGPR by a VMEM instruction requires 5 wait states when the
7240b57cec5SDimitry Andric // SGPR was written by a VALU Instruction.
7250b57cec5SDimitry Andric const int VmemSgprWaitStates = 5;
726fe6060f1SDimitry Andric auto IsHazardDefFn = [this](const MachineInstr &MI) {
727fe6060f1SDimitry Andric return TII.isVALU(MI);
728fe6060f1SDimitry Andric };
7290b57cec5SDimitry Andric for (const MachineOperand &Use : VMEM->uses()) {
730fe6060f1SDimitry Andric if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
7310b57cec5SDimitry Andric continue;
7320b57cec5SDimitry Andric
7330b57cec5SDimitry Andric int WaitStatesNeededForUse =
7340b57cec5SDimitry Andric VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
7350b57cec5SDimitry Andric VmemSgprWaitStates);
7360b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7370b57cec5SDimitry Andric }
7380b57cec5SDimitry Andric return WaitStatesNeeded;
7390b57cec5SDimitry Andric }
7400b57cec5SDimitry Andric
checkDPPHazards(MachineInstr * DPP)7410b57cec5SDimitry Andric int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
7420b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
7430b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
7440b57cec5SDimitry Andric
7450b57cec5SDimitry Andric // Check for DPP VGPR read after VALU VGPR write and EXEC write.
7460b57cec5SDimitry Andric int DppVgprWaitStates = 2;
7470b57cec5SDimitry Andric int DppExecWaitStates = 5;
7480b57cec5SDimitry Andric int WaitStatesNeeded = 0;
749fe6060f1SDimitry Andric auto IsHazardDefFn = [TII](const MachineInstr &MI) {
750fe6060f1SDimitry Andric return TII->isVALU(MI);
751fe6060f1SDimitry Andric };
7520b57cec5SDimitry Andric
7530b57cec5SDimitry Andric for (const MachineOperand &Use : DPP->uses()) {
7540b57cec5SDimitry Andric if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
7550b57cec5SDimitry Andric continue;
7560b57cec5SDimitry Andric int WaitStatesNeededForUse =
757fe6060f1SDimitry Andric DppVgprWaitStates - getWaitStatesSinceDef(
758fe6060f1SDimitry Andric Use.getReg(),
759fe6060f1SDimitry Andric [](const MachineInstr &) { return true; },
7600b57cec5SDimitry Andric DppVgprWaitStates);
7610b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
7620b57cec5SDimitry Andric }
7630b57cec5SDimitry Andric
7640b57cec5SDimitry Andric WaitStatesNeeded = std::max(
7650b57cec5SDimitry Andric WaitStatesNeeded,
7660b57cec5SDimitry Andric DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
7670b57cec5SDimitry Andric DppExecWaitStates));
7680b57cec5SDimitry Andric
7690b57cec5SDimitry Andric return WaitStatesNeeded;
7700b57cec5SDimitry Andric }
7710b57cec5SDimitry Andric
checkDivFMasHazards(MachineInstr * DivFMas)7720b57cec5SDimitry Andric int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
7730b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
7740b57cec5SDimitry Andric
7750b57cec5SDimitry Andric // v_div_fmas requires 4 wait states after a write to vcc from a VALU
7760b57cec5SDimitry Andric // instruction.
7770b57cec5SDimitry Andric const int DivFMasWaitStates = 4;
778fe6060f1SDimitry Andric auto IsHazardDefFn = [TII](const MachineInstr &MI) {
779fe6060f1SDimitry Andric return TII->isVALU(MI);
780fe6060f1SDimitry Andric };
7810b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
7820b57cec5SDimitry Andric DivFMasWaitStates);
7830b57cec5SDimitry Andric
7840b57cec5SDimitry Andric return DivFMasWaitStates - WaitStatesNeeded;
7850b57cec5SDimitry Andric }
7860b57cec5SDimitry Andric
checkGetRegHazards(MachineInstr * GetRegInstr)7870b57cec5SDimitry Andric int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
7880b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
7890b57cec5SDimitry Andric unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
7900b57cec5SDimitry Andric
7910b57cec5SDimitry Andric const int GetRegWaitStates = 2;
792fe6060f1SDimitry Andric auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
793fe6060f1SDimitry Andric return GetRegHWReg == getHWReg(TII, MI);
7940b57cec5SDimitry Andric };
7950b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
7960b57cec5SDimitry Andric
7970b57cec5SDimitry Andric return GetRegWaitStates - WaitStatesNeeded;
7980b57cec5SDimitry Andric }
7990b57cec5SDimitry Andric
checkSetRegHazards(MachineInstr * SetRegInstr)8000b57cec5SDimitry Andric int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
8010b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
8020b57cec5SDimitry Andric unsigned HWReg = getHWReg(TII, *SetRegInstr);
8030b57cec5SDimitry Andric
8040b57cec5SDimitry Andric const int SetRegWaitStates = ST.getSetRegWaitStates();
805fe6060f1SDimitry Andric auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
806fe6060f1SDimitry Andric return HWReg == getHWReg(TII, MI);
8070b57cec5SDimitry Andric };
8080b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
8090b57cec5SDimitry Andric return SetRegWaitStates - WaitStatesNeeded;
8100b57cec5SDimitry Andric }
8110b57cec5SDimitry Andric
createsVALUHazard(const MachineInstr & MI)8120b57cec5SDimitry Andric int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
8130b57cec5SDimitry Andric if (!MI.mayStore())
8140b57cec5SDimitry Andric return -1;
8150b57cec5SDimitry Andric
8160b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
8170b57cec5SDimitry Andric unsigned Opcode = MI.getOpcode();
8180b57cec5SDimitry Andric const MCInstrDesc &Desc = MI.getDesc();
8190b57cec5SDimitry Andric
8200b57cec5SDimitry Andric int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
8210b57cec5SDimitry Andric int VDataRCID = -1;
8220b57cec5SDimitry Andric if (VDataIdx != -1)
823bdd1243dSDimitry Andric VDataRCID = Desc.operands()[VDataIdx].RegClass;
8240b57cec5SDimitry Andric
8250b57cec5SDimitry Andric if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
8260b57cec5SDimitry Andric // There is no hazard if the instruction does not use vector regs
8270b57cec5SDimitry Andric // (like wbinvl1)
8280b57cec5SDimitry Andric if (VDataIdx == -1)
8290b57cec5SDimitry Andric return -1;
8300b57cec5SDimitry Andric // For MUBUF/MTBUF instructions this hazard only exists if the
8310b57cec5SDimitry Andric // instruction is not using a register in the soffset field.
8320b57cec5SDimitry Andric const MachineOperand *SOffset =
8330b57cec5SDimitry Andric TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
8340b57cec5SDimitry Andric // If we have no soffset operand, then assume this field has been
8350b57cec5SDimitry Andric // hardcoded to zero.
8360b57cec5SDimitry Andric if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
8370b57cec5SDimitry Andric (!SOffset || !SOffset->isReg()))
8380b57cec5SDimitry Andric return VDataIdx;
8390b57cec5SDimitry Andric }
8400b57cec5SDimitry Andric
8410b57cec5SDimitry Andric // MIMG instructions create a hazard if they don't use a 256-bit T# and
8420b57cec5SDimitry Andric // the store size is greater than 8 bytes and they have more than two bits
8430b57cec5SDimitry Andric // of their dmask set.
8440b57cec5SDimitry Andric // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
8450b57cec5SDimitry Andric if (TII->isMIMG(MI)) {
8460b57cec5SDimitry Andric int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
8470b57cec5SDimitry Andric assert(SRsrcIdx != -1 &&
848bdd1243dSDimitry Andric AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
8490b57cec5SDimitry Andric (void)SRsrcIdx;
8500b57cec5SDimitry Andric }
8510b57cec5SDimitry Andric
8520b57cec5SDimitry Andric if (TII->isFLAT(MI)) {
8530b57cec5SDimitry Andric int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
854bdd1243dSDimitry Andric if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
8550b57cec5SDimitry Andric return DataIdx;
8560b57cec5SDimitry Andric }
8570b57cec5SDimitry Andric
8580b57cec5SDimitry Andric return -1;
8590b57cec5SDimitry Andric }
8600b57cec5SDimitry Andric
861e8d8bef9SDimitry Andric int
checkVALUHazardsHelper(const MachineOperand & Def,const MachineRegisterInfo & MRI)862e8d8bef9SDimitry Andric GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
8630b57cec5SDimitry Andric const MachineRegisterInfo &MRI) {
8640b57cec5SDimitry Andric // Helper to check for the hazard where VMEM instructions that store more than
8650b57cec5SDimitry Andric // 8 bytes can have there store data over written by the next instruction.
8660b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
8670b57cec5SDimitry Andric
86881ad6265SDimitry Andric const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
8690b57cec5SDimitry Andric int WaitStatesNeeded = 0;
8700b57cec5SDimitry Andric
871fe6060f1SDimitry Andric if (!TRI->isVectorRegister(MRI, Def.getReg()))
8720b57cec5SDimitry Andric return WaitStatesNeeded;
8738bcb0991SDimitry Andric Register Reg = Def.getReg();
874fe6060f1SDimitry Andric auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
875fe6060f1SDimitry Andric int DataIdx = createsVALUHazard(MI);
8760b57cec5SDimitry Andric return DataIdx >= 0 &&
877fe6060f1SDimitry Andric TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
8780b57cec5SDimitry Andric };
8790b57cec5SDimitry Andric int WaitStatesNeededForDef =
8800b57cec5SDimitry Andric VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
8810b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
8820b57cec5SDimitry Andric
8830b57cec5SDimitry Andric return WaitStatesNeeded;
8840b57cec5SDimitry Andric }
8850b57cec5SDimitry Andric
checkVALUHazards(MachineInstr * VALU)8860b57cec5SDimitry Andric int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
88781ad6265SDimitry Andric int WaitStatesNeeded = 0;
88881ad6265SDimitry Andric
88981ad6265SDimitry Andric if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
89081ad6265SDimitry Andric const int TransDefWaitstates = 1;
89181ad6265SDimitry Andric
89281ad6265SDimitry Andric auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
89381ad6265SDimitry Andric if (!SIInstrInfo::isTRANS(MI))
89481ad6265SDimitry Andric return false;
89581ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
89681ad6265SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
89781ad6265SDimitry Andric Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
89881ad6265SDimitry Andric
89981ad6265SDimitry Andric for (const MachineOperand &Use : VALU->explicit_uses()) {
90081ad6265SDimitry Andric if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
90181ad6265SDimitry Andric return true;
90281ad6265SDimitry Andric }
90381ad6265SDimitry Andric
90481ad6265SDimitry Andric return false;
90581ad6265SDimitry Andric };
90681ad6265SDimitry Andric
90781ad6265SDimitry Andric int WaitStatesNeededForDef =
90881ad6265SDimitry Andric TransDefWaitstates -
90981ad6265SDimitry Andric getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
91081ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
91181ad6265SDimitry Andric }
91281ad6265SDimitry Andric
91381ad6265SDimitry Andric if (ST.hasDstSelForwardingHazard()) {
91481ad6265SDimitry Andric const int Shift16DefWaitstates = 1;
91581ad6265SDimitry Andric
91681ad6265SDimitry Andric auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
91781ad6265SDimitry Andric if (!SIInstrInfo::isVALU(MI))
91881ad6265SDimitry Andric return false;
91981ad6265SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
92081ad6265SDimitry Andric if (SIInstrInfo::isSDWA(MI)) {
92181ad6265SDimitry Andric if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
92281ad6265SDimitry Andric if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
92381ad6265SDimitry Andric return false;
92481ad6265SDimitry Andric } else {
925bdd1243dSDimitry Andric if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
92681ad6265SDimitry Andric !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
92781ad6265SDimitry Andric ->getImm() &
92881ad6265SDimitry Andric SISrcMods::DST_OP_SEL))
92981ad6265SDimitry Andric return false;
93081ad6265SDimitry Andric }
93181ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
93281ad6265SDimitry Andric if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
93381ad6265SDimitry Andric Register Def = Dst->getReg();
93481ad6265SDimitry Andric
93581ad6265SDimitry Andric for (const MachineOperand &Use : VALU->explicit_uses()) {
93681ad6265SDimitry Andric if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
93781ad6265SDimitry Andric return true;
93881ad6265SDimitry Andric }
93981ad6265SDimitry Andric }
94081ad6265SDimitry Andric
94181ad6265SDimitry Andric return false;
94281ad6265SDimitry Andric };
94381ad6265SDimitry Andric
94481ad6265SDimitry Andric int WaitStatesNeededForDef =
94581ad6265SDimitry Andric Shift16DefWaitstates -
94681ad6265SDimitry Andric getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
94781ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
94881ad6265SDimitry Andric }
94981ad6265SDimitry Andric
95081ad6265SDimitry Andric if (ST.hasVDecCoExecHazard()) {
95181ad6265SDimitry Andric const int VALUWriteSGPRVALUReadWaitstates = 2;
95281ad6265SDimitry Andric const int VALUWriteEXECRWLane = 4;
95381ad6265SDimitry Andric const int VALUWriteVGPRReadlaneRead = 1;
95481ad6265SDimitry Andric
95581ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
95681ad6265SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo();
95781ad6265SDimitry Andric Register UseReg;
95881ad6265SDimitry Andric auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
95981ad6265SDimitry Andric if (!SIInstrInfo::isVALU(MI))
96081ad6265SDimitry Andric return false;
96181ad6265SDimitry Andric return MI.modifiesRegister(UseReg, TRI);
96281ad6265SDimitry Andric };
96381ad6265SDimitry Andric
96481ad6265SDimitry Andric for (const MachineOperand &Use : VALU->explicit_uses()) {
96581ad6265SDimitry Andric if (!Use.isReg())
96681ad6265SDimitry Andric continue;
96781ad6265SDimitry Andric
96881ad6265SDimitry Andric UseReg = Use.getReg();
96981ad6265SDimitry Andric if (TRI->isSGPRReg(MRI, UseReg)) {
97081ad6265SDimitry Andric int WaitStatesNeededForDef =
97181ad6265SDimitry Andric VALUWriteSGPRVALUReadWaitstates -
97281ad6265SDimitry Andric getWaitStatesSince(IsVALUDefSGPRFn,
97381ad6265SDimitry Andric VALUWriteSGPRVALUReadWaitstates);
97481ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
97581ad6265SDimitry Andric }
97681ad6265SDimitry Andric }
97781ad6265SDimitry Andric
97881ad6265SDimitry Andric if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
97981ad6265SDimitry Andric UseReg = AMDGPU::VCC;
98081ad6265SDimitry Andric int WaitStatesNeededForDef =
98181ad6265SDimitry Andric VALUWriteSGPRVALUReadWaitstates -
98281ad6265SDimitry Andric getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
98381ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
98481ad6265SDimitry Andric }
98581ad6265SDimitry Andric
98681ad6265SDimitry Andric switch (VALU->getOpcode()) {
98781ad6265SDimitry Andric case AMDGPU::V_READLANE_B32:
98881ad6265SDimitry Andric case AMDGPU::V_READFIRSTLANE_B32: {
98981ad6265SDimitry Andric MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
99081ad6265SDimitry Andric UseReg = Src->getReg();
99181ad6265SDimitry Andric int WaitStatesNeededForDef =
99281ad6265SDimitry Andric VALUWriteVGPRReadlaneRead -
99381ad6265SDimitry Andric getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
99481ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
99581ad6265SDimitry Andric }
996bdd1243dSDimitry Andric [[fallthrough]];
99781ad6265SDimitry Andric case AMDGPU::V_WRITELANE_B32: {
99881ad6265SDimitry Andric UseReg = AMDGPU::EXEC;
99981ad6265SDimitry Andric int WaitStatesNeededForDef =
100081ad6265SDimitry Andric VALUWriteEXECRWLane -
100181ad6265SDimitry Andric getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
100281ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
100381ad6265SDimitry Andric break;
100481ad6265SDimitry Andric }
100581ad6265SDimitry Andric default:
100681ad6265SDimitry Andric break;
100781ad6265SDimitry Andric }
100881ad6265SDimitry Andric }
100981ad6265SDimitry Andric
10100b57cec5SDimitry Andric // This checks for the hazard where VMEM instructions that store more than
10110b57cec5SDimitry Andric // 8 bytes can have there store data over written by the next instruction.
10120b57cec5SDimitry Andric if (!ST.has12DWordStoreHazard())
101381ad6265SDimitry Andric return WaitStatesNeeded;
10140b57cec5SDimitry Andric
10150b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo();
10160b57cec5SDimitry Andric
10170b57cec5SDimitry Andric for (const MachineOperand &Def : VALU->defs()) {
10180b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
10190b57cec5SDimitry Andric }
10200b57cec5SDimitry Andric
10210b57cec5SDimitry Andric return WaitStatesNeeded;
10220b57cec5SDimitry Andric }
10230b57cec5SDimitry Andric
checkInlineAsmHazards(MachineInstr * IA)10240b57cec5SDimitry Andric int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10250b57cec5SDimitry Andric // This checks for hazards associated with inline asm statements.
10260b57cec5SDimitry Andric // Since inline asms can contain just about anything, we use this
10270b57cec5SDimitry Andric // to call/leverage other check*Hazard routines. Note that
10280b57cec5SDimitry Andric // this function doesn't attempt to address all possible inline asm
10290b57cec5SDimitry Andric // hazards (good luck), but is a collection of what has been
10300b57cec5SDimitry Andric // problematic thus far.
10310b57cec5SDimitry Andric
10320b57cec5SDimitry Andric // see checkVALUHazards()
10330b57cec5SDimitry Andric if (!ST.has12DWordStoreHazard())
10340b57cec5SDimitry Andric return 0;
10350b57cec5SDimitry Andric
10360b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo();
10370b57cec5SDimitry Andric int WaitStatesNeeded = 0;
10380b57cec5SDimitry Andric
103906c3fb27SDimitry Andric for (const MachineOperand &Op :
104006c3fb27SDimitry Andric llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
10410b57cec5SDimitry Andric if (Op.isReg() && Op.isDef()) {
104206c3fb27SDimitry Andric WaitStatesNeeded =
104306c3fb27SDimitry Andric std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
10440b57cec5SDimitry Andric }
10450b57cec5SDimitry Andric }
10460b57cec5SDimitry Andric
10470b57cec5SDimitry Andric return WaitStatesNeeded;
10480b57cec5SDimitry Andric }
10490b57cec5SDimitry Andric
checkRWLaneHazards(MachineInstr * RWLane)10500b57cec5SDimitry Andric int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
10510b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
10520b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
10530b57cec5SDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo();
10540b57cec5SDimitry Andric
10550b57cec5SDimitry Andric const MachineOperand *LaneSelectOp =
10560b57cec5SDimitry Andric TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
10570b57cec5SDimitry Andric
10580b57cec5SDimitry Andric if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
10590b57cec5SDimitry Andric return 0;
10600b57cec5SDimitry Andric
10618bcb0991SDimitry Andric Register LaneSelectReg = LaneSelectOp->getReg();
1062fe6060f1SDimitry Andric auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
10630b57cec5SDimitry Andric
10640b57cec5SDimitry Andric const int RWLaneWaitStates = 4;
10650b57cec5SDimitry Andric int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
10660b57cec5SDimitry Andric RWLaneWaitStates);
10670b57cec5SDimitry Andric return RWLaneWaitStates - WaitStatesSince;
10680b57cec5SDimitry Andric }
10690b57cec5SDimitry Andric
checkRFEHazards(MachineInstr * RFE)10700b57cec5SDimitry Andric int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
10710b57cec5SDimitry Andric if (!ST.hasRFEHazards())
10720b57cec5SDimitry Andric return 0;
10730b57cec5SDimitry Andric
10740b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
10750b57cec5SDimitry Andric
10760b57cec5SDimitry Andric const int RFEWaitStates = 1;
10770b57cec5SDimitry Andric
1078fe6060f1SDimitry Andric auto IsHazardFn = [TII](const MachineInstr &MI) {
1079fe6060f1SDimitry Andric return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
10800b57cec5SDimitry Andric };
10810b57cec5SDimitry Andric int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
10820b57cec5SDimitry Andric return RFEWaitStates - WaitStatesNeeded;
10830b57cec5SDimitry Andric }
10840b57cec5SDimitry Andric
checkReadM0Hazards(MachineInstr * MI)10850b57cec5SDimitry Andric int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
10860b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
108781ad6265SDimitry Andric const int ReadM0WaitStates = 1;
1088fe6060f1SDimitry Andric auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
108981ad6265SDimitry Andric return ReadM0WaitStates -
109081ad6265SDimitry Andric getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
10910b57cec5SDimitry Andric }
10920b57cec5SDimitry Andric
fixHazards(MachineInstr * MI)10930b57cec5SDimitry Andric void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
10940b57cec5SDimitry Andric fixVMEMtoScalarWriteHazards(MI);
10950b57cec5SDimitry Andric fixVcmpxPermlaneHazards(MI);
10960b57cec5SDimitry Andric fixSMEMtoVectorWriteHazards(MI);
10970b57cec5SDimitry Andric fixVcmpxExecWARHazard(MI);
10980b57cec5SDimitry Andric fixLdsBranchVmemWARHazard(MI);
109981ad6265SDimitry Andric if (ST.hasLdsDirect()) {
110081ad6265SDimitry Andric fixLdsDirectVALUHazard(MI);
110181ad6265SDimitry Andric fixLdsDirectVMEMHazard(MI);
110281ad6265SDimitry Andric }
110381ad6265SDimitry Andric fixVALUPartialForwardingHazard(MI);
110481ad6265SDimitry Andric fixVALUTransUseHazard(MI);
110581ad6265SDimitry Andric fixWMMAHazards(MI);
1106bdd1243dSDimitry Andric fixShift64HighRegBug(MI);
1107bdd1243dSDimitry Andric fixVALUMaskWriteHazard(MI);
1108*0fca6ea1SDimitry Andric fixRequiredExportPriority(MI);
11090b57cec5SDimitry Andric }
11100b57cec5SDimitry Andric
fixVcmpxPermlaneHazards(MachineInstr * MI)11110b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
11120b57cec5SDimitry Andric if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
11130b57cec5SDimitry Andric return false;
11140b57cec5SDimitry Andric
11150b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
111681ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
111781ad6265SDimitry Andric auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
111881ad6265SDimitry Andric return (TII->isVOPC(MI) ||
111981ad6265SDimitry Andric ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
112081ad6265SDimitry Andric MI.modifiesRegister(AMDGPU::EXEC, TRI);
112181ad6265SDimitry Andric };
11220b57cec5SDimitry Andric
1123fe6060f1SDimitry Andric auto IsExpiredFn = [](const MachineInstr &MI, int) {
1124fe6060f1SDimitry Andric unsigned Opc = MI.getOpcode();
1125fe6060f1SDimitry Andric return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1126fe6060f1SDimitry Andric Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
11270b57cec5SDimitry Andric };
11280b57cec5SDimitry Andric
11290b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11300b57cec5SDimitry Andric std::numeric_limits<int>::max())
11310b57cec5SDimitry Andric return false;
11320b57cec5SDimitry Andric
11330b57cec5SDimitry Andric // V_NOP will be discarded by SQ.
113481ad6265SDimitry Andric // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
11350b57cec5SDimitry Andric // which is always a VGPR and available.
11360b57cec5SDimitry Andric auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
11378bcb0991SDimitry Andric Register Reg = Src0->getReg();
11380b57cec5SDimitry Andric bool IsUndef = Src0->isUndef();
11390b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
11400b57cec5SDimitry Andric TII->get(AMDGPU::V_MOV_B32_e32))
11410b57cec5SDimitry Andric .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
11420b57cec5SDimitry Andric .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
11430b57cec5SDimitry Andric
11440b57cec5SDimitry Andric return true;
11450b57cec5SDimitry Andric }
11460b57cec5SDimitry Andric
fixVMEMtoScalarWriteHazards(MachineInstr * MI)11470b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
11480b57cec5SDimitry Andric if (!ST.hasVMEMtoScalarWriteHazard())
11490b57cec5SDimitry Andric return false;
11507a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
11510b57cec5SDimitry Andric
11520b57cec5SDimitry Andric if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
11530b57cec5SDimitry Andric return false;
11540b57cec5SDimitry Andric
11550b57cec5SDimitry Andric if (MI->getNumDefs() == 0)
11560b57cec5SDimitry Andric return false;
11570b57cec5SDimitry Andric
11580b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
11590b57cec5SDimitry Andric
1160fe6060f1SDimitry Andric auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1161fe6060f1SDimitry Andric if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1162fe6060f1SDimitry Andric !SIInstrInfo::isFLAT(I))
11630b57cec5SDimitry Andric return false;
11640b57cec5SDimitry Andric
11650b57cec5SDimitry Andric for (const MachineOperand &Def : MI->defs()) {
1166fe6060f1SDimitry Andric const MachineOperand *Op =
1167*0fca6ea1SDimitry Andric I.findRegisterUseOperand(Def.getReg(), TRI, false);
11680b57cec5SDimitry Andric if (!Op)
11690b57cec5SDimitry Andric continue;
11700b57cec5SDimitry Andric return true;
11710b57cec5SDimitry Andric }
11720b57cec5SDimitry Andric return false;
11730b57cec5SDimitry Andric };
11740b57cec5SDimitry Andric
1175fe6060f1SDimitry Andric auto IsExpiredFn = [](const MachineInstr &MI, int) {
1176fe6060f1SDimitry Andric return SIInstrInfo::isVALU(MI) ||
1177fe6060f1SDimitry Andric (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1178fe6060f1SDimitry Andric !MI.getOperand(0).getImm()) ||
1179fe6060f1SDimitry Andric (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
118006c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
11810b57cec5SDimitry Andric };
11820b57cec5SDimitry Andric
11830b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
11840b57cec5SDimitry Andric std::numeric_limits<int>::max())
11850b57cec5SDimitry Andric return false;
11860b57cec5SDimitry Andric
11870b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
1188e8d8bef9SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1189e8d8bef9SDimitry Andric TII->get(AMDGPU::S_WAITCNT_DEPCTR))
119006c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
11910b57cec5SDimitry Andric return true;
11920b57cec5SDimitry Andric }
11930b57cec5SDimitry Andric
fixSMEMtoVectorWriteHazards(MachineInstr * MI)11940b57cec5SDimitry Andric bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
11950b57cec5SDimitry Andric if (!ST.hasSMEMtoVectorWriteHazard())
11960b57cec5SDimitry Andric return false;
11977a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
11980b57cec5SDimitry Andric
11990b57cec5SDimitry Andric if (!SIInstrInfo::isVALU(*MI))
12000b57cec5SDimitry Andric return false;
12010b57cec5SDimitry Andric
12020b57cec5SDimitry Andric unsigned SDSTName;
12030b57cec5SDimitry Andric switch (MI->getOpcode()) {
12040b57cec5SDimitry Andric case AMDGPU::V_READLANE_B32:
12050b57cec5SDimitry Andric case AMDGPU::V_READFIRSTLANE_B32:
12060b57cec5SDimitry Andric SDSTName = AMDGPU::OpName::vdst;
12070b57cec5SDimitry Andric break;
12080b57cec5SDimitry Andric default:
12090b57cec5SDimitry Andric SDSTName = AMDGPU::OpName::sdst;
12100b57cec5SDimitry Andric break;
12110b57cec5SDimitry Andric }
12120b57cec5SDimitry Andric
12130b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
12140b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
12150b57cec5SDimitry Andric const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
12160b57cec5SDimitry Andric const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
12170b57cec5SDimitry Andric if (!SDST) {
12180b57cec5SDimitry Andric for (const auto &MO : MI->implicit_operands()) {
1219bdd1243dSDimitry Andric if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
12200b57cec5SDimitry Andric SDST = &MO;
12210b57cec5SDimitry Andric break;
12220b57cec5SDimitry Andric }
12230b57cec5SDimitry Andric }
12240b57cec5SDimitry Andric }
12250b57cec5SDimitry Andric
12260b57cec5SDimitry Andric if (!SDST)
12270b57cec5SDimitry Andric return false;
12280b57cec5SDimitry Andric
12298bcb0991SDimitry Andric const Register SDSTReg = SDST->getReg();
1230fe6060f1SDimitry Andric auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1231fe6060f1SDimitry Andric return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
12320b57cec5SDimitry Andric };
12330b57cec5SDimitry Andric
1234fe6060f1SDimitry Andric auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1235fe6060f1SDimitry Andric if (TII->isSALU(MI)) {
1236fe6060f1SDimitry Andric switch (MI.getOpcode()) {
12370b57cec5SDimitry Andric case AMDGPU::S_SETVSKIP:
12380b57cec5SDimitry Andric case AMDGPU::S_VERSION:
12390b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VSCNT:
12400b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VMCNT:
12410b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_EXPCNT:
12420b57cec5SDimitry Andric // These instructions cannot not mitigate the hazard.
12430b57cec5SDimitry Andric return false;
12440b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_LGKMCNT:
12450b57cec5SDimitry Andric // Reducing lgkmcnt count to 0 always mitigates the hazard.
1246fe6060f1SDimitry Andric return (MI.getOperand(1).getImm() == 0) &&
1247fe6060f1SDimitry Andric (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
12480b57cec5SDimitry Andric case AMDGPU::S_WAITCNT: {
1249fe6060f1SDimitry Andric const int64_t Imm = MI.getOperand(0).getImm();
12500b57cec5SDimitry Andric AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
12517a6dacacSDimitry Andric // DsCnt corresponds to LGKMCnt here.
12527a6dacacSDimitry Andric return (Decoded.DsCnt == 0);
12530b57cec5SDimitry Andric }
12540b57cec5SDimitry Andric default:
12550b57cec5SDimitry Andric // SOPP instructions cannot mitigate the hazard.
1256fe6060f1SDimitry Andric if (TII->isSOPP(MI))
12570b57cec5SDimitry Andric return false;
12580b57cec5SDimitry Andric // At this point the SALU can be assumed to mitigate the hazard
12590b57cec5SDimitry Andric // because either:
12600b57cec5SDimitry Andric // (a) it is independent of the at risk SMEM (breaking chain),
12610b57cec5SDimitry Andric // or
12620b57cec5SDimitry Andric // (b) it is dependent on the SMEM, in which case an appropriate
12630b57cec5SDimitry Andric // s_waitcnt lgkmcnt _must_ exist between it and the at risk
12640b57cec5SDimitry Andric // SMEM instruction.
12650b57cec5SDimitry Andric return true;
12660b57cec5SDimitry Andric }
12670b57cec5SDimitry Andric }
12680b57cec5SDimitry Andric return false;
12690b57cec5SDimitry Andric };
12700b57cec5SDimitry Andric
12710b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
12720b57cec5SDimitry Andric std::numeric_limits<int>::max())
12730b57cec5SDimitry Andric return false;
12740b57cec5SDimitry Andric
12750b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
12760b57cec5SDimitry Andric TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
12770b57cec5SDimitry Andric .addImm(0);
12780b57cec5SDimitry Andric return true;
12790b57cec5SDimitry Andric }
12800b57cec5SDimitry Andric
fixVcmpxExecWARHazard(MachineInstr * MI)12810b57cec5SDimitry Andric bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
12827a6dacacSDimitry Andric if (!ST.hasVcmpxExecWARHazard())
12837a6dacacSDimitry Andric return false;
12847a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
12857a6dacacSDimitry Andric
12867a6dacacSDimitry Andric if (!SIInstrInfo::isVALU(*MI))
12870b57cec5SDimitry Andric return false;
12880b57cec5SDimitry Andric
12890b57cec5SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
12900b57cec5SDimitry Andric if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
12910b57cec5SDimitry Andric return false;
12920b57cec5SDimitry Andric
1293fe6060f1SDimitry Andric auto IsHazardFn = [TRI](const MachineInstr &I) {
1294fe6060f1SDimitry Andric if (SIInstrInfo::isVALU(I))
12950b57cec5SDimitry Andric return false;
1296fe6060f1SDimitry Andric return I.readsRegister(AMDGPU::EXEC, TRI);
12970b57cec5SDimitry Andric };
12980b57cec5SDimitry Andric
12990b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
1300fe6060f1SDimitry Andric auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1301fe6060f1SDimitry Andric if (SIInstrInfo::isVALU(MI)) {
1302fe6060f1SDimitry Andric if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
13030b57cec5SDimitry Andric return true;
1304fe6060f1SDimitry Andric for (auto MO : MI.implicit_operands())
1305bdd1243dSDimitry Andric if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
13060b57cec5SDimitry Andric return true;
13070b57cec5SDimitry Andric }
1308fe6060f1SDimitry Andric if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
130906c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
13100b57cec5SDimitry Andric return true;
13110b57cec5SDimitry Andric return false;
13120b57cec5SDimitry Andric };
13130b57cec5SDimitry Andric
13140b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
13150b57cec5SDimitry Andric std::numeric_limits<int>::max())
13160b57cec5SDimitry Andric return false;
13170b57cec5SDimitry Andric
13180b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
13190b57cec5SDimitry Andric TII->get(AMDGPU::S_WAITCNT_DEPCTR))
132006c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
13210b57cec5SDimitry Andric return true;
13220b57cec5SDimitry Andric }
13230b57cec5SDimitry Andric
shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction & MF,const GCNSubtarget & ST)1324fe6060f1SDimitry Andric static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1325fe6060f1SDimitry Andric const GCNSubtarget &ST) {
13260b57cec5SDimitry Andric if (!ST.hasLdsBranchVmemWARHazard())
13270b57cec5SDimitry Andric return false;
13280b57cec5SDimitry Andric
1329fe6060f1SDimitry Andric // Check if the necessary condition for the hazard is met: both LDS and VMEM
1330fe6060f1SDimitry Andric // instructions need to appear in the same function.
1331fe6060f1SDimitry Andric bool HasLds = false;
1332fe6060f1SDimitry Andric bool HasVmem = false;
1333fe6060f1SDimitry Andric for (auto &MBB : MF) {
1334fe6060f1SDimitry Andric for (auto &MI : MBB) {
1335fe6060f1SDimitry Andric HasLds |= SIInstrInfo::isDS(MI);
1336fe6060f1SDimitry Andric HasVmem |=
1337fe6060f1SDimitry Andric SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1338fe6060f1SDimitry Andric if (HasLds && HasVmem)
1339fe6060f1SDimitry Andric return true;
1340fe6060f1SDimitry Andric }
1341fe6060f1SDimitry Andric }
1342fe6060f1SDimitry Andric return false;
1343fe6060f1SDimitry Andric }
1344fe6060f1SDimitry Andric
isStoreCountWaitZero(const MachineInstr & I)1345bdd1243dSDimitry Andric static bool isStoreCountWaitZero(const MachineInstr &I) {
1346bdd1243dSDimitry Andric return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1347bdd1243dSDimitry Andric I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1348bdd1243dSDimitry Andric !I.getOperand(1).getImm();
1349bdd1243dSDimitry Andric }
1350bdd1243dSDimitry Andric
fixLdsBranchVmemWARHazard(MachineInstr * MI)1351fe6060f1SDimitry Andric bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1352fe6060f1SDimitry Andric if (!RunLdsBranchVmemWARHazardFixup)
1353fe6060f1SDimitry Andric return false;
1354fe6060f1SDimitry Andric
1355fe6060f1SDimitry Andric assert(ST.hasLdsBranchVmemWARHazard());
13567a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
1357fe6060f1SDimitry Andric
1358fe6060f1SDimitry Andric auto IsHazardInst = [](const MachineInstr &MI) {
1359fe6060f1SDimitry Andric if (SIInstrInfo::isDS(MI))
13600b57cec5SDimitry Andric return 1;
1361fe6060f1SDimitry Andric if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
13620b57cec5SDimitry Andric return 2;
13630b57cec5SDimitry Andric return 0;
13640b57cec5SDimitry Andric };
13650b57cec5SDimitry Andric
1366fe6060f1SDimitry Andric auto InstType = IsHazardInst(*MI);
13670b57cec5SDimitry Andric if (!InstType)
13680b57cec5SDimitry Andric return false;
13690b57cec5SDimitry Andric
1370fe6060f1SDimitry Andric auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1371bdd1243dSDimitry Andric return IsHazardInst(I) || isStoreCountWaitZero(I);
13720b57cec5SDimitry Andric };
13730b57cec5SDimitry Andric
1374fe6060f1SDimitry Andric auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1375fe6060f1SDimitry Andric if (!I.isBranch())
13760b57cec5SDimitry Andric return false;
13770b57cec5SDimitry Andric
1378fe6060f1SDimitry Andric auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
13790b57cec5SDimitry Andric auto InstType2 = IsHazardInst(I);
13800b57cec5SDimitry Andric return InstType2 && InstType != InstType2;
13810b57cec5SDimitry Andric };
13820b57cec5SDimitry Andric
1383fe6060f1SDimitry Andric auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
13840b57cec5SDimitry Andric auto InstType2 = IsHazardInst(I);
13850b57cec5SDimitry Andric if (InstType == InstType2)
13860b57cec5SDimitry Andric return true;
13870b57cec5SDimitry Andric
1388bdd1243dSDimitry Andric return isStoreCountWaitZero(I);
13890b57cec5SDimitry Andric };
13900b57cec5SDimitry Andric
1391fe6060f1SDimitry Andric return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
13920b57cec5SDimitry Andric std::numeric_limits<int>::max();
13930b57cec5SDimitry Andric };
13940b57cec5SDimitry Andric
13950b57cec5SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
13960b57cec5SDimitry Andric std::numeric_limits<int>::max())
13970b57cec5SDimitry Andric return false;
13980b57cec5SDimitry Andric
13990b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
14000b57cec5SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
14010b57cec5SDimitry Andric TII->get(AMDGPU::S_WAITCNT_VSCNT))
14020b57cec5SDimitry Andric .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
14030b57cec5SDimitry Andric .addImm(0);
14040b57cec5SDimitry Andric
14050b57cec5SDimitry Andric return true;
14060b57cec5SDimitry Andric }
14070b57cec5SDimitry Andric
fixLdsDirectVALUHazard(MachineInstr * MI)140881ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
140981ad6265SDimitry Andric if (!SIInstrInfo::isLDSDIR(*MI))
141081ad6265SDimitry Andric return false;
141181ad6265SDimitry Andric
141281ad6265SDimitry Andric const int NoHazardWaitStates = 15;
141381ad6265SDimitry Andric const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
141481ad6265SDimitry Andric const Register VDSTReg = VDST->getReg();
141581ad6265SDimitry Andric
141681ad6265SDimitry Andric bool VisitedTrans = false;
141781ad6265SDimitry Andric auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
141881ad6265SDimitry Andric if (!SIInstrInfo::isVALU(I))
141981ad6265SDimitry Andric return false;
142081ad6265SDimitry Andric VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
142181ad6265SDimitry Andric // Cover both WAR and WAW
142281ad6265SDimitry Andric return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
142381ad6265SDimitry Andric };
142481ad6265SDimitry Andric auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
142581ad6265SDimitry Andric if (WaitStates >= NoHazardWaitStates)
142681ad6265SDimitry Andric return true;
142781ad6265SDimitry Andric // Instructions which cause va_vdst==0 expire hazard
142881ad6265SDimitry Andric return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
142981ad6265SDimitry Andric SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
143081ad6265SDimitry Andric };
143181ad6265SDimitry Andric auto GetWaitStatesFn = [](const MachineInstr &MI) {
143281ad6265SDimitry Andric return SIInstrInfo::isVALU(MI) ? 1 : 0;
143381ad6265SDimitry Andric };
143481ad6265SDimitry Andric
143581ad6265SDimitry Andric DenseSet<const MachineBasicBlock *> Visited;
143681ad6265SDimitry Andric auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
143781ad6265SDimitry Andric std::next(MI->getReverseIterator()), 0,
143881ad6265SDimitry Andric IsExpiredFn, Visited, GetWaitStatesFn);
143981ad6265SDimitry Andric
144081ad6265SDimitry Andric // Transcendentals can execute in parallel to other VALUs.
144181ad6265SDimitry Andric // This makes va_vdst count unusable with a mixture of VALU and TRANS.
144281ad6265SDimitry Andric if (VisitedTrans)
144381ad6265SDimitry Andric Count = 0;
144481ad6265SDimitry Andric
144581ad6265SDimitry Andric MachineOperand *WaitVdstOp =
144681ad6265SDimitry Andric TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
144781ad6265SDimitry Andric WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
144881ad6265SDimitry Andric
144981ad6265SDimitry Andric return true;
145081ad6265SDimitry Andric }
145181ad6265SDimitry Andric
fixLdsDirectVMEMHazard(MachineInstr * MI)145281ad6265SDimitry Andric bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
145381ad6265SDimitry Andric if (!SIInstrInfo::isLDSDIR(*MI))
145481ad6265SDimitry Andric return false;
145581ad6265SDimitry Andric
145681ad6265SDimitry Andric const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
145781ad6265SDimitry Andric const Register VDSTReg = VDST->getReg();
145881ad6265SDimitry Andric
145981ad6265SDimitry Andric auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
146081ad6265SDimitry Andric if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
146181ad6265SDimitry Andric !SIInstrInfo::isDS(I))
146281ad6265SDimitry Andric return false;
146381ad6265SDimitry Andric return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
146481ad6265SDimitry Andric };
1465297eecfbSDimitry Andric bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
14667a6dacacSDimitry Andric // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
14677a6dacacSDimitry Andric // according to the type of VMEM instruction.
1468297eecfbSDimitry Andric auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
146981ad6265SDimitry Andric return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
147081ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
147181ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1472297eecfbSDimitry Andric AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1473297eecfbSDimitry Andric (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1474297eecfbSDimitry Andric !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
147581ad6265SDimitry Andric };
147681ad6265SDimitry Andric
147781ad6265SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
147881ad6265SDimitry Andric std::numeric_limits<int>::max())
147981ad6265SDimitry Andric return false;
148081ad6265SDimitry Andric
1481297eecfbSDimitry Andric if (LdsdirCanWait) {
1482297eecfbSDimitry Andric TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1483297eecfbSDimitry Andric } else {
148481ad6265SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
148581ad6265SDimitry Andric TII.get(AMDGPU::S_WAITCNT_DEPCTR))
148606c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1487297eecfbSDimitry Andric }
148881ad6265SDimitry Andric
148981ad6265SDimitry Andric return true;
149081ad6265SDimitry Andric }
149181ad6265SDimitry Andric
fixVALUPartialForwardingHazard(MachineInstr * MI)149281ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
149381ad6265SDimitry Andric if (!ST.hasVALUPartialForwardingHazard())
149481ad6265SDimitry Andric return false;
14957a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
14967a6dacacSDimitry Andric
14977a6dacacSDimitry Andric if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
149881ad6265SDimitry Andric return false;
149981ad6265SDimitry Andric
150081ad6265SDimitry Andric SmallSetVector<Register, 4> SrcVGPRs;
150181ad6265SDimitry Andric
150281ad6265SDimitry Andric for (const MachineOperand &Use : MI->explicit_uses()) {
150381ad6265SDimitry Andric if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
150481ad6265SDimitry Andric SrcVGPRs.insert(Use.getReg());
150581ad6265SDimitry Andric }
150681ad6265SDimitry Andric
150781ad6265SDimitry Andric // Only applies with >= 2 unique VGPR sources
150881ad6265SDimitry Andric if (SrcVGPRs.size() <= 1)
150981ad6265SDimitry Andric return false;
151081ad6265SDimitry Andric
151181ad6265SDimitry Andric // Look for the following pattern:
151281ad6265SDimitry Andric // Va <- VALU [PreExecPos]
151381ad6265SDimitry Andric // intv1
151481ad6265SDimitry Andric // Exec <- SALU [ExecPos]
151581ad6265SDimitry Andric // intv2
151681ad6265SDimitry Andric // Vb <- VALU [PostExecPos]
151781ad6265SDimitry Andric // intv3
151881ad6265SDimitry Andric // MI Va, Vb (WaitState = 0)
151981ad6265SDimitry Andric //
152081ad6265SDimitry Andric // Where:
152181ad6265SDimitry Andric // intv1 + intv2 <= 2 VALUs
152281ad6265SDimitry Andric // intv3 <= 4 VALUs
152381ad6265SDimitry Andric //
152481ad6265SDimitry Andric // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
152581ad6265SDimitry Andric
152681ad6265SDimitry Andric const int Intv1plus2MaxVALUs = 2;
152781ad6265SDimitry Andric const int Intv3MaxVALUs = 4;
152881ad6265SDimitry Andric const int IntvMaxVALUs = 6;
152981ad6265SDimitry Andric const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
153081ad6265SDimitry Andric
153181ad6265SDimitry Andric struct StateType {
153281ad6265SDimitry Andric SmallDenseMap<Register, int, 4> DefPos;
153381ad6265SDimitry Andric int ExecPos = std::numeric_limits<int>::max();
153481ad6265SDimitry Andric int VALUs = 0;
153581ad6265SDimitry Andric };
153681ad6265SDimitry Andric
153781ad6265SDimitry Andric StateType State;
153881ad6265SDimitry Andric
153981ad6265SDimitry Andric // This overloads expiry testing with all the hazard detection
154081ad6265SDimitry Andric auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
154181ad6265SDimitry Andric // Too many VALU states have passed
154281ad6265SDimitry Andric if (State.VALUs > NoHazardVALUWaitStates)
154381ad6265SDimitry Andric return HazardExpired;
154481ad6265SDimitry Andric
154581ad6265SDimitry Andric // Instructions which cause va_vdst==0 expire hazard
154681ad6265SDimitry Andric if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
154781ad6265SDimitry Andric SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
154881ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
154906c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
155081ad6265SDimitry Andric return HazardExpired;
155181ad6265SDimitry Andric
155281ad6265SDimitry Andric // Track registers writes
155381ad6265SDimitry Andric bool Changed = false;
155481ad6265SDimitry Andric if (SIInstrInfo::isVALU(I)) {
155581ad6265SDimitry Andric for (Register Src : SrcVGPRs) {
155681ad6265SDimitry Andric if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
155781ad6265SDimitry Andric State.DefPos[Src] = State.VALUs;
155881ad6265SDimitry Andric Changed = true;
155981ad6265SDimitry Andric }
156081ad6265SDimitry Andric }
156181ad6265SDimitry Andric } else if (SIInstrInfo::isSALU(I)) {
156281ad6265SDimitry Andric if (State.ExecPos == std::numeric_limits<int>::max()) {
156381ad6265SDimitry Andric if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
156481ad6265SDimitry Andric State.ExecPos = State.VALUs;
156581ad6265SDimitry Andric Changed = true;
156681ad6265SDimitry Andric }
156781ad6265SDimitry Andric }
156881ad6265SDimitry Andric }
156981ad6265SDimitry Andric
157081ad6265SDimitry Andric // Early expiration: too many VALUs in intv3
157181ad6265SDimitry Andric if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
157281ad6265SDimitry Andric return HazardExpired;
157381ad6265SDimitry Andric
157481ad6265SDimitry Andric // Only evaluate state if something changed
157581ad6265SDimitry Andric if (!Changed)
157681ad6265SDimitry Andric return NoHazardFound;
157781ad6265SDimitry Andric
157881ad6265SDimitry Andric // Determine positions of VALUs pre/post exec change
157981ad6265SDimitry Andric if (State.ExecPos == std::numeric_limits<int>::max())
158081ad6265SDimitry Andric return NoHazardFound;
158181ad6265SDimitry Andric
158281ad6265SDimitry Andric int PreExecPos = std::numeric_limits<int>::max();
158381ad6265SDimitry Andric int PostExecPos = std::numeric_limits<int>::max();
158481ad6265SDimitry Andric
158581ad6265SDimitry Andric for (auto Entry : State.DefPos) {
158681ad6265SDimitry Andric int DefVALUs = Entry.second;
158781ad6265SDimitry Andric if (DefVALUs != std::numeric_limits<int>::max()) {
158881ad6265SDimitry Andric if (DefVALUs >= State.ExecPos)
158981ad6265SDimitry Andric PreExecPos = std::min(PreExecPos, DefVALUs);
1590*0fca6ea1SDimitry Andric else
159181ad6265SDimitry Andric PostExecPos = std::min(PostExecPos, DefVALUs);
159281ad6265SDimitry Andric }
159381ad6265SDimitry Andric }
159481ad6265SDimitry Andric
159581ad6265SDimitry Andric // Need a VALUs post exec change
159681ad6265SDimitry Andric if (PostExecPos == std::numeric_limits<int>::max())
159781ad6265SDimitry Andric return NoHazardFound;
159881ad6265SDimitry Andric
159981ad6265SDimitry Andric // Too many VALUs in intv3?
160081ad6265SDimitry Andric int Intv3VALUs = PostExecPos;
160181ad6265SDimitry Andric if (Intv3VALUs > Intv3MaxVALUs)
160281ad6265SDimitry Andric return HazardExpired;
160381ad6265SDimitry Andric
160481ad6265SDimitry Andric // Too many VALUs in intv2?
160581ad6265SDimitry Andric int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
160681ad6265SDimitry Andric if (Intv2VALUs > Intv1plus2MaxVALUs)
160781ad6265SDimitry Andric return HazardExpired;
160881ad6265SDimitry Andric
160981ad6265SDimitry Andric // Need a VALUs pre exec change
161081ad6265SDimitry Andric if (PreExecPos == std::numeric_limits<int>::max())
161181ad6265SDimitry Andric return NoHazardFound;
161281ad6265SDimitry Andric
161381ad6265SDimitry Andric // Too many VALUs in intv1?
161481ad6265SDimitry Andric int Intv1VALUs = PreExecPos - State.ExecPos;
161581ad6265SDimitry Andric if (Intv1VALUs > Intv1plus2MaxVALUs)
161681ad6265SDimitry Andric return HazardExpired;
161781ad6265SDimitry Andric
161881ad6265SDimitry Andric // Too many VALUs in intv1 + intv2
161981ad6265SDimitry Andric if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
162081ad6265SDimitry Andric return HazardExpired;
162181ad6265SDimitry Andric
162281ad6265SDimitry Andric return HazardFound;
162381ad6265SDimitry Andric };
162481ad6265SDimitry Andric auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
162581ad6265SDimitry Andric if (SIInstrInfo::isVALU(MI))
162681ad6265SDimitry Andric State.VALUs += 1;
162781ad6265SDimitry Andric };
162881ad6265SDimitry Andric
162981ad6265SDimitry Andric DenseSet<const MachineBasicBlock *> Visited;
163081ad6265SDimitry Andric if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
163181ad6265SDimitry Andric std::next(MI->getReverseIterator()), Visited))
163281ad6265SDimitry Andric return false;
163381ad6265SDimitry Andric
163481ad6265SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
163581ad6265SDimitry Andric TII.get(AMDGPU::S_WAITCNT_DEPCTR))
163681ad6265SDimitry Andric .addImm(0x0fff);
163781ad6265SDimitry Andric
163881ad6265SDimitry Andric return true;
163981ad6265SDimitry Andric }
164081ad6265SDimitry Andric
fixVALUTransUseHazard(MachineInstr * MI)164181ad6265SDimitry Andric bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
164281ad6265SDimitry Andric if (!ST.hasVALUTransUseHazard())
164381ad6265SDimitry Andric return false;
16447a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
16457a6dacacSDimitry Andric
164681ad6265SDimitry Andric if (!SIInstrInfo::isVALU(*MI))
164781ad6265SDimitry Andric return false;
164881ad6265SDimitry Andric
164981ad6265SDimitry Andric SmallSet<Register, 4> SrcVGPRs;
165081ad6265SDimitry Andric
165181ad6265SDimitry Andric for (const MachineOperand &Use : MI->explicit_uses()) {
165281ad6265SDimitry Andric if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
165381ad6265SDimitry Andric SrcVGPRs.insert(Use.getReg());
165481ad6265SDimitry Andric }
165581ad6265SDimitry Andric
165681ad6265SDimitry Andric // Look for the following pattern:
165781ad6265SDimitry Andric // Va <- TRANS VALU
165881ad6265SDimitry Andric // intv
165981ad6265SDimitry Andric // MI Va (WaitState = 0)
166081ad6265SDimitry Andric //
166181ad6265SDimitry Andric // Where:
166281ad6265SDimitry Andric // intv <= 5 VALUs / 1 TRANS
166381ad6265SDimitry Andric //
166481ad6265SDimitry Andric // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
166581ad6265SDimitry Andric
166681ad6265SDimitry Andric const int IntvMaxVALUs = 5;
166781ad6265SDimitry Andric const int IntvMaxTRANS = 1;
166881ad6265SDimitry Andric
166981ad6265SDimitry Andric struct StateType {
167081ad6265SDimitry Andric int VALUs = 0;
167181ad6265SDimitry Andric int TRANS = 0;
167281ad6265SDimitry Andric };
167381ad6265SDimitry Andric
167481ad6265SDimitry Andric StateType State;
167581ad6265SDimitry Andric
167681ad6265SDimitry Andric // This overloads expiry testing with all the hazard detection
167781ad6265SDimitry Andric auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
167881ad6265SDimitry Andric // Too many VALU states have passed
167981ad6265SDimitry Andric if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
168081ad6265SDimitry Andric return HazardExpired;
168181ad6265SDimitry Andric
168281ad6265SDimitry Andric // Instructions which cause va_vdst==0 expire hazard
168381ad6265SDimitry Andric if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
168481ad6265SDimitry Andric SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
168581ad6265SDimitry Andric (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
168681ad6265SDimitry Andric I.getOperand(0).getImm() == 0x0fff))
168781ad6265SDimitry Andric return HazardExpired;
168881ad6265SDimitry Andric
168981ad6265SDimitry Andric // Track registers writes
169081ad6265SDimitry Andric if (SIInstrInfo::isTRANS(I)) {
169181ad6265SDimitry Andric for (Register Src : SrcVGPRs) {
169281ad6265SDimitry Andric if (I.modifiesRegister(Src, &TRI)) {
169381ad6265SDimitry Andric return HazardFound;
169481ad6265SDimitry Andric }
169581ad6265SDimitry Andric }
169681ad6265SDimitry Andric }
169781ad6265SDimitry Andric
169881ad6265SDimitry Andric return NoHazardFound;
169981ad6265SDimitry Andric };
170081ad6265SDimitry Andric auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
170181ad6265SDimitry Andric if (SIInstrInfo::isVALU(MI))
170281ad6265SDimitry Andric State.VALUs += 1;
170381ad6265SDimitry Andric if (SIInstrInfo::isTRANS(MI))
170481ad6265SDimitry Andric State.TRANS += 1;
170581ad6265SDimitry Andric };
170681ad6265SDimitry Andric
170781ad6265SDimitry Andric DenseSet<const MachineBasicBlock *> Visited;
170881ad6265SDimitry Andric if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
170981ad6265SDimitry Andric std::next(MI->getReverseIterator()), Visited))
171081ad6265SDimitry Andric return false;
171181ad6265SDimitry Andric
171281ad6265SDimitry Andric // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
171306c3fb27SDimitry Andric // avoided.
171481ad6265SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
171581ad6265SDimitry Andric TII.get(AMDGPU::S_WAITCNT_DEPCTR))
171606c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
171781ad6265SDimitry Andric
171881ad6265SDimitry Andric return true;
171981ad6265SDimitry Andric }
172081ad6265SDimitry Andric
fixWMMAHazards(MachineInstr * MI)172181ad6265SDimitry Andric bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1722b3edf446SDimitry Andric if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
172381ad6265SDimitry Andric return false;
172481ad6265SDimitry Andric
172581ad6265SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
172681ad6265SDimitry Andric const SIRegisterInfo *TRI = ST.getRegisterInfo();
172781ad6265SDimitry Andric
1728b3edf446SDimitry Andric auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1729b3edf446SDimitry Andric if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
173081ad6265SDimitry Andric return false;
173181ad6265SDimitry Andric
1732*0fca6ea1SDimitry Andric // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1733*0fca6ea1SDimitry Andric // with the dest(matrix D) of the previous wmma.
173481ad6265SDimitry Andric const Register CurSrc0Reg =
173581ad6265SDimitry Andric TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
173681ad6265SDimitry Andric const Register CurSrc1Reg =
173781ad6265SDimitry Andric TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
173881ad6265SDimitry Andric
173981ad6265SDimitry Andric const Register PrevDstReg =
174081ad6265SDimitry Andric TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
174181ad6265SDimitry Andric
174281ad6265SDimitry Andric if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
174381ad6265SDimitry Andric TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
174481ad6265SDimitry Andric return true;
174581ad6265SDimitry Andric }
174681ad6265SDimitry Andric
1747b3edf446SDimitry Andric // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1748b3edf446SDimitry Andric // but Index can't overlap with PrevDstReg.
1749b3edf446SDimitry Andric if (AMDGPU::isGFX12Plus(ST)) {
1750b3edf446SDimitry Andric if (SIInstrInfo::isSWMMAC(*MI)) {
1751b3edf446SDimitry Andric const Register CurIndex =
1752b3edf446SDimitry Andric TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1753b3edf446SDimitry Andric if (TRI->regsOverlap(PrevDstReg, CurIndex))
1754b3edf446SDimitry Andric return true;
1755b3edf446SDimitry Andric }
1756b3edf446SDimitry Andric return false;
1757b3edf446SDimitry Andric }
1758b3edf446SDimitry Andric
175981ad6265SDimitry Andric return false;
176081ad6265SDimitry Andric };
176181ad6265SDimitry Andric
176281ad6265SDimitry Andric auto IsExpiredFn = [](const MachineInstr &I, int) {
176381ad6265SDimitry Andric return SIInstrInfo::isVALU(I);
176481ad6265SDimitry Andric };
176581ad6265SDimitry Andric
176681ad6265SDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
176781ad6265SDimitry Andric std::numeric_limits<int>::max())
176881ad6265SDimitry Andric return false;
176981ad6265SDimitry Andric
177081ad6265SDimitry Andric BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
177181ad6265SDimitry Andric
177281ad6265SDimitry Andric return true;
177381ad6265SDimitry Andric }
177481ad6265SDimitry Andric
fixShift64HighRegBug(MachineInstr * MI)1775bdd1243dSDimitry Andric bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1776bdd1243dSDimitry Andric if (!ST.hasShift64HighRegBug())
1777bdd1243dSDimitry Andric return false;
17787a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
1779bdd1243dSDimitry Andric
1780bdd1243dSDimitry Andric switch (MI->getOpcode()) {
1781bdd1243dSDimitry Andric default:
1782bdd1243dSDimitry Andric return false;
1783bdd1243dSDimitry Andric case AMDGPU::V_LSHLREV_B64_e64:
1784bdd1243dSDimitry Andric case AMDGPU::V_LSHRREV_B64_e64:
1785bdd1243dSDimitry Andric case AMDGPU::V_ASHRREV_I64_e64:
1786bdd1243dSDimitry Andric break;
1787bdd1243dSDimitry Andric }
1788bdd1243dSDimitry Andric
1789bdd1243dSDimitry Andric MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1790bdd1243dSDimitry Andric if (!Amt->isReg())
1791bdd1243dSDimitry Andric return false;
1792bdd1243dSDimitry Andric
1793bdd1243dSDimitry Andric Register AmtReg = Amt->getReg();
1794bdd1243dSDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo();
1795bdd1243dSDimitry Andric // Check if this is a last VGPR in the allocation block.
1796bdd1243dSDimitry Andric if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1797bdd1243dSDimitry Andric return false;
1798bdd1243dSDimitry Andric
1799bdd1243dSDimitry Andric if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1800bdd1243dSDimitry Andric return false;
1801bdd1243dSDimitry Andric
1802bdd1243dSDimitry Andric MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1803bdd1243dSDimitry Andric bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1804bdd1243dSDimitry Andric bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1805bdd1243dSDimitry Andric bool Overlapped = OverlappedSrc || OverlappedDst;
1806bdd1243dSDimitry Andric
1807bdd1243dSDimitry Andric assert(!OverlappedDst || !OverlappedSrc ||
1808bdd1243dSDimitry Andric Src1->getReg() == MI->getOperand(0).getReg());
1809bdd1243dSDimitry Andric assert(ST.needsAlignedVGPRs());
1810bdd1243dSDimitry Andric static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1811bdd1243dSDimitry Andric
1812bdd1243dSDimitry Andric Register NewReg;
1813bdd1243dSDimitry Andric for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1814bdd1243dSDimitry Andric : AMDGPU::VGPR_32RegClass) {
1815bdd1243dSDimitry Andric if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1816bdd1243dSDimitry Andric NewReg = Reg;
1817bdd1243dSDimitry Andric break;
1818bdd1243dSDimitry Andric }
1819bdd1243dSDimitry Andric }
1820bdd1243dSDimitry Andric
1821bdd1243dSDimitry Andric Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1822bdd1243dSDimitry Andric : NewReg;
1823bdd1243dSDimitry Andric Register NewAmtLo;
1824bdd1243dSDimitry Andric
1825bdd1243dSDimitry Andric if (Overlapped)
1826bdd1243dSDimitry Andric NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1827bdd1243dSDimitry Andric
1828bdd1243dSDimitry Andric DebugLoc DL = MI->getDebugLoc();
1829bdd1243dSDimitry Andric MachineBasicBlock *MBB = MI->getParent();
1830bdd1243dSDimitry Andric // Insert a full wait count because found register might be pending a wait.
1831bdd1243dSDimitry Andric BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1832bdd1243dSDimitry Andric .addImm(0);
1833bdd1243dSDimitry Andric
1834bdd1243dSDimitry Andric // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1835bdd1243dSDimitry Andric if (Overlapped)
1836bdd1243dSDimitry Andric runOnInstruction(
1837bdd1243dSDimitry Andric BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1838bdd1243dSDimitry Andric .addDef(AmtReg - 1)
1839bdd1243dSDimitry Andric .addReg(AmtReg - 1, RegState::Undef)
1840bdd1243dSDimitry Andric .addReg(NewAmtLo, RegState::Undef));
1841bdd1243dSDimitry Andric runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1842bdd1243dSDimitry Andric .addDef(AmtReg)
1843bdd1243dSDimitry Andric .addReg(AmtReg, RegState::Undef)
1844bdd1243dSDimitry Andric .addReg(NewAmt, RegState::Undef));
1845bdd1243dSDimitry Andric
1846bdd1243dSDimitry Andric // Instructions emitted after the current instruction will be processed by the
1847bdd1243dSDimitry Andric // parent loop of the hazard recognizer in a natural way.
1848bdd1243dSDimitry Andric BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1849bdd1243dSDimitry Andric AmtReg)
1850bdd1243dSDimitry Andric .addDef(NewAmt)
1851bdd1243dSDimitry Andric .addReg(NewAmt)
1852bdd1243dSDimitry Andric .addReg(AmtReg);
1853bdd1243dSDimitry Andric if (Overlapped)
1854bdd1243dSDimitry Andric BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1855bdd1243dSDimitry Andric AmtReg - 1)
1856bdd1243dSDimitry Andric .addDef(NewAmtLo)
1857bdd1243dSDimitry Andric .addReg(NewAmtLo)
1858bdd1243dSDimitry Andric .addReg(AmtReg - 1);
1859bdd1243dSDimitry Andric
1860bdd1243dSDimitry Andric // Re-running hazard recognizer on the modified instruction is not necessary,
1861bdd1243dSDimitry Andric // inserted V_SWAP_B32 has already both read and write new registers so
1862bdd1243dSDimitry Andric // hazards related to these register has already been handled.
1863bdd1243dSDimitry Andric Amt->setReg(NewAmt);
1864bdd1243dSDimitry Andric Amt->setIsKill(false);
1865bdd1243dSDimitry Andric // We do not update liveness, so verifier may see it as undef.
1866bdd1243dSDimitry Andric Amt->setIsUndef();
1867bdd1243dSDimitry Andric if (OverlappedDst)
1868bdd1243dSDimitry Andric MI->getOperand(0).setReg(NewReg);
1869bdd1243dSDimitry Andric if (OverlappedSrc) {
1870bdd1243dSDimitry Andric Src1->setReg(NewReg);
1871bdd1243dSDimitry Andric Src1->setIsKill(false);
1872bdd1243dSDimitry Andric Src1->setIsUndef();
1873bdd1243dSDimitry Andric }
1874bdd1243dSDimitry Andric
1875bdd1243dSDimitry Andric return true;
1876bdd1243dSDimitry Andric }
1877bdd1243dSDimitry Andric
checkNSAtoVMEMHazard(MachineInstr * MI)18780b57cec5SDimitry Andric int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
18790b57cec5SDimitry Andric int NSAtoVMEMWaitStates = 1;
18800b57cec5SDimitry Andric
18810b57cec5SDimitry Andric if (!ST.hasNSAtoVMEMBug())
18820b57cec5SDimitry Andric return 0;
18830b57cec5SDimitry Andric
18840b57cec5SDimitry Andric if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
18850b57cec5SDimitry Andric return 0;
18860b57cec5SDimitry Andric
18870b57cec5SDimitry Andric const SIInstrInfo *TII = ST.getInstrInfo();
18880b57cec5SDimitry Andric const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
18890b57cec5SDimitry Andric if (!Offset || (Offset->getImm() & 6) == 0)
18900b57cec5SDimitry Andric return 0;
18910b57cec5SDimitry Andric
1892fe6060f1SDimitry Andric auto IsHazardFn = [TII](const MachineInstr &I) {
1893fe6060f1SDimitry Andric if (!SIInstrInfo::isMIMG(I))
18940b57cec5SDimitry Andric return false;
1895fe6060f1SDimitry Andric const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
18960b57cec5SDimitry Andric return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1897fe6060f1SDimitry Andric TII->getInstSizeInBytes(I) >= 16;
18980b57cec5SDimitry Andric };
18990b57cec5SDimitry Andric
19000b57cec5SDimitry Andric return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
19010b57cec5SDimitry Andric }
19020b57cec5SDimitry Andric
checkFPAtomicToDenormModeHazard(MachineInstr * MI)19030b57cec5SDimitry Andric int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
19040b57cec5SDimitry Andric int FPAtomicToDenormModeWaitStates = 3;
19050b57cec5SDimitry Andric
1906bdd1243dSDimitry Andric if (!ST.hasFPAtomicToDenormModeHazard())
1907bdd1243dSDimitry Andric return 0;
19087a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
1909bdd1243dSDimitry Andric
19100b57cec5SDimitry Andric if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
19110b57cec5SDimitry Andric return 0;
19120b57cec5SDimitry Andric
1913fe6060f1SDimitry Andric auto IsHazardFn = [](const MachineInstr &I) {
1914fe6060f1SDimitry Andric if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
19150b57cec5SDimitry Andric return false;
1916fe6060f1SDimitry Andric return SIInstrInfo::isFPAtomic(I);
19170b57cec5SDimitry Andric };
19180b57cec5SDimitry Andric
1919fe6060f1SDimitry Andric auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1920fe6060f1SDimitry Andric if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
19210b57cec5SDimitry Andric return true;
19220b57cec5SDimitry Andric
1923fe6060f1SDimitry Andric switch (MI.getOpcode()) {
19240b57cec5SDimitry Andric case AMDGPU::S_WAITCNT:
19250b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VSCNT:
19260b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_VMCNT:
19270b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_EXPCNT:
19280b57cec5SDimitry Andric case AMDGPU::S_WAITCNT_LGKMCNT:
1929e8d8bef9SDimitry Andric case AMDGPU::S_WAIT_IDLE:
19300b57cec5SDimitry Andric return true;
19310b57cec5SDimitry Andric default:
19320b57cec5SDimitry Andric break;
19330b57cec5SDimitry Andric }
19340b57cec5SDimitry Andric
19350b57cec5SDimitry Andric return false;
19360b57cec5SDimitry Andric };
19370b57cec5SDimitry Andric
19380b57cec5SDimitry Andric return FPAtomicToDenormModeWaitStates -
19390b57cec5SDimitry Andric ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
19400b57cec5SDimitry Andric }
19410b57cec5SDimitry Andric
checkMAIHazards(MachineInstr * MI)19420b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
19430b57cec5SDimitry Andric assert(SIInstrInfo::isMAI(*MI));
19440b57cec5SDimitry Andric
1945fe6060f1SDimitry Andric return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1946fe6060f1SDimitry Andric }
1947fe6060f1SDimitry Andric
checkMFMAPadding(MachineInstr * MI)194881ad6265SDimitry Andric int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
194981ad6265SDimitry Andric // Early exit if no padding is requested.
195081ad6265SDimitry Andric if (MFMAPaddingRatio == 0)
195181ad6265SDimitry Andric return 0;
195281ad6265SDimitry Andric
195381ad6265SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
195481ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
195581ad6265SDimitry Andric return 0;
195681ad6265SDimitry Andric
195781ad6265SDimitry Andric int NeighborMFMALatency = 0;
195881ad6265SDimitry Andric auto IsNeighboringMFMA = [&NeighborMFMALatency,
195981ad6265SDimitry Andric this](const MachineInstr &MI) {
196081ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(MI))
196181ad6265SDimitry Andric return false;
196281ad6265SDimitry Andric
196381ad6265SDimitry Andric NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
196481ad6265SDimitry Andric return true;
196581ad6265SDimitry Andric };
196681ad6265SDimitry Andric
196781ad6265SDimitry Andric const int MaxMFMAPipelineWaitStates = 16;
196881ad6265SDimitry Andric int WaitStatesSinceNeighborMFMA =
196981ad6265SDimitry Andric getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
197081ad6265SDimitry Andric
197181ad6265SDimitry Andric int NeighborMFMAPaddingNeeded =
197281ad6265SDimitry Andric (NeighborMFMALatency * MFMAPaddingRatio / 100) -
197381ad6265SDimitry Andric WaitStatesSinceNeighborMFMA;
197481ad6265SDimitry Andric
197581ad6265SDimitry Andric return std::max(0, NeighborMFMAPaddingNeeded);
197681ad6265SDimitry Andric }
197781ad6265SDimitry Andric
checkMAIHazards908(MachineInstr * MI)1978fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
19790b57cec5SDimitry Andric int WaitStatesNeeded = 0;
19800b57cec5SDimitry Andric unsigned Opc = MI->getOpcode();
19810b57cec5SDimitry Andric
1982fe6060f1SDimitry Andric auto IsVALUFn = [](const MachineInstr &MI) {
1983bdd1243dSDimitry Andric return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
19840b57cec5SDimitry Andric };
19850b57cec5SDimitry Andric
1986e8d8bef9SDimitry Andric if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
19870b57cec5SDimitry Andric const int LegacyVALUWritesVGPRWaitStates = 2;
19880b57cec5SDimitry Andric const int VALUWritesExecWaitStates = 4;
19890b57cec5SDimitry Andric const int MaxWaitStates = 4;
19900b57cec5SDimitry Andric
19910b57cec5SDimitry Andric int WaitStatesNeededForUse = VALUWritesExecWaitStates -
19920b57cec5SDimitry Andric getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
19930b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
19940b57cec5SDimitry Andric
19950b57cec5SDimitry Andric if (WaitStatesNeeded < MaxWaitStates) {
19960b57cec5SDimitry Andric for (const MachineOperand &Use : MI->explicit_uses()) {
19970b57cec5SDimitry Andric const int MaxWaitStates = 2;
19980b57cec5SDimitry Andric
19990b57cec5SDimitry Andric if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
20000b57cec5SDimitry Andric continue;
20010b57cec5SDimitry Andric
20020b57cec5SDimitry Andric int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
20030b57cec5SDimitry Andric getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
20040b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20050b57cec5SDimitry Andric
20060b57cec5SDimitry Andric if (WaitStatesNeeded == MaxWaitStates)
20070b57cec5SDimitry Andric break;
20080b57cec5SDimitry Andric }
20090b57cec5SDimitry Andric }
20100b57cec5SDimitry Andric }
20110b57cec5SDimitry Andric
20120b57cec5SDimitry Andric for (const MachineOperand &Op : MI->explicit_operands()) {
20130b57cec5SDimitry Andric if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
20140b57cec5SDimitry Andric continue;
20150b57cec5SDimitry Andric
2016e8d8bef9SDimitry Andric if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
20170b57cec5SDimitry Andric continue;
20180b57cec5SDimitry Andric
20190b57cec5SDimitry Andric const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
20200b57cec5SDimitry Andric const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
20210b57cec5SDimitry Andric const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
20220b57cec5SDimitry Andric const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
20230b57cec5SDimitry Andric const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
20240b57cec5SDimitry Andric const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
20250b57cec5SDimitry Andric const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
20260b57cec5SDimitry Andric const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
20270b57cec5SDimitry Andric const int MaxWaitStates = 18;
20288bcb0991SDimitry Andric Register Reg = Op.getReg();
20290b57cec5SDimitry Andric unsigned HazardDefLatency = 0;
20300b57cec5SDimitry Andric
203181ad6265SDimitry Andric auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2032fe6060f1SDimitry Andric this](const MachineInstr &MI) {
203381ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(MI))
20340b57cec5SDimitry Andric return false;
2035fe6060f1SDimitry Andric Register DstReg = MI.getOperand(0).getReg();
20360b57cec5SDimitry Andric if (DstReg == Reg)
20370b57cec5SDimitry Andric return false;
2038fe6060f1SDimitry Andric HazardDefLatency =
2039fe6060f1SDimitry Andric std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
20400b57cec5SDimitry Andric return TRI.regsOverlap(DstReg, Reg);
20410b57cec5SDimitry Andric };
20420b57cec5SDimitry Andric
20430b57cec5SDimitry Andric int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
20440b57cec5SDimitry Andric MaxWaitStates);
20450b57cec5SDimitry Andric int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
20460b57cec5SDimitry Andric int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
204706c3fb27SDimitry Andric int OpNo = Op.getOperandNo();
20480b57cec5SDimitry Andric if (OpNo == SrcCIdx) {
20490b57cec5SDimitry Andric NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2050e8d8bef9SDimitry Andric } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
20510b57cec5SDimitry Andric switch (HazardDefLatency) {
20520b57cec5SDimitry Andric case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
20530b57cec5SDimitry Andric break;
20540b57cec5SDimitry Andric case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
20550b57cec5SDimitry Andric break;
2056bdd1243dSDimitry Andric case 16: [[fallthrough]];
20570b57cec5SDimitry Andric default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
20580b57cec5SDimitry Andric break;
20590b57cec5SDimitry Andric }
2060e8d8bef9SDimitry Andric } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
20610b57cec5SDimitry Andric switch (HazardDefLatency) {
20620b57cec5SDimitry Andric case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
20630b57cec5SDimitry Andric break;
20640b57cec5SDimitry Andric case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
20650b57cec5SDimitry Andric break;
2066bdd1243dSDimitry Andric case 16: [[fallthrough]];
20670b57cec5SDimitry Andric default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
20680b57cec5SDimitry Andric break;
20690b57cec5SDimitry Andric }
20700b57cec5SDimitry Andric }
20710b57cec5SDimitry Andric
20720b57cec5SDimitry Andric int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
20730b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20740b57cec5SDimitry Andric
20750b57cec5SDimitry Andric if (WaitStatesNeeded == MaxWaitStates)
20760b57cec5SDimitry Andric return WaitStatesNeeded; // Early exit.
20770b57cec5SDimitry Andric
2078fe6060f1SDimitry Andric auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2079fe6060f1SDimitry Andric if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
20800b57cec5SDimitry Andric return false;
2081fe6060f1SDimitry Andric Register DstReg = MI.getOperand(0).getReg();
20820b57cec5SDimitry Andric return TRI.regsOverlap(Reg, DstReg);
20830b57cec5SDimitry Andric };
20840b57cec5SDimitry Andric
20850b57cec5SDimitry Andric const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
20860b57cec5SDimitry Andric const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
20870b57cec5SDimitry Andric const int AccVGPRWriteAccVgprReadWaitStates = 3;
20880b57cec5SDimitry Andric NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
20890b57cec5SDimitry Andric if (OpNo == SrcCIdx)
20900b57cec5SDimitry Andric NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2091e8d8bef9SDimitry Andric else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
20920b57cec5SDimitry Andric NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
20930b57cec5SDimitry Andric
20940b57cec5SDimitry Andric WaitStatesNeededForUse = NeedWaitStates -
20950b57cec5SDimitry Andric getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
20960b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
20970b57cec5SDimitry Andric
20980b57cec5SDimitry Andric if (WaitStatesNeeded == MaxWaitStates)
20990b57cec5SDimitry Andric return WaitStatesNeeded; // Early exit.
21000b57cec5SDimitry Andric }
21010b57cec5SDimitry Andric
2102e8d8bef9SDimitry Andric if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
21030b57cec5SDimitry Andric const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
21040b57cec5SDimitry Andric const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
21050b57cec5SDimitry Andric const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
21060b57cec5SDimitry Andric const int MaxWaitStates = 13;
21078bcb0991SDimitry Andric Register DstReg = MI->getOperand(0).getReg();
21080b57cec5SDimitry Andric unsigned HazardDefLatency = 0;
21090b57cec5SDimitry Andric
211081ad6265SDimitry Andric auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2111fe6060f1SDimitry Andric this](const MachineInstr &MI) {
211281ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(MI))
21130b57cec5SDimitry Andric return false;
2114fe6060f1SDimitry Andric Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2115fe6060f1SDimitry Andric HazardDefLatency =
2116fe6060f1SDimitry Andric std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
21170b57cec5SDimitry Andric return TRI.regsOverlap(Reg, DstReg);
21180b57cec5SDimitry Andric };
21190b57cec5SDimitry Andric
21200b57cec5SDimitry Andric int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
21210b57cec5SDimitry Andric int NeedWaitStates;
21220b57cec5SDimitry Andric switch (HazardDefLatency) {
21230b57cec5SDimitry Andric case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
21240b57cec5SDimitry Andric break;
21250b57cec5SDimitry Andric case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
21260b57cec5SDimitry Andric break;
2127bdd1243dSDimitry Andric case 16: [[fallthrough]];
21280b57cec5SDimitry Andric default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
21290b57cec5SDimitry Andric break;
21300b57cec5SDimitry Andric }
21310b57cec5SDimitry Andric
21320b57cec5SDimitry Andric int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
21330b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
21340b57cec5SDimitry Andric }
21350b57cec5SDimitry Andric
213681ad6265SDimitry Andric // Pad neighboring MFMA with noops for better inter-wave performance.
213781ad6265SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
213881ad6265SDimitry Andric
21390b57cec5SDimitry Andric return WaitStatesNeeded;
21400b57cec5SDimitry Andric }
21410b57cec5SDimitry Andric
/// Wait states for the case named by this function (an N-pass XDL write of a
/// VGPR overlapping SMFMA SrcC): one more than the pass count, i.e.
/// 2 -> 3, 4 -> 5, 8 -> 9, 16 -> 17.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 1;
  return NumPasses + ExtraWaitStates;
}
2150*0fca6ea1SDimitry Andric
/// Wait states for the case named by this function (an N-pass SMFMA write of
/// a VGPR overlapping SMFMA SrcC): exactly the pass count, i.e.
/// 2 -> 2, 4 -> 4, 8 -> 8, 16 -> 16.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 0;
  return NumPasses + ExtraWaitStates;
}
2159*0fca6ea1SDimitry Andric
/// Wait states for the case named by this function (an N-pass SMFMA write of
/// a VGPR overlapping SrcA/SrcB): two more than the pass count, i.e.
/// 2 -> 4, 4 -> 6, 8 -> 10, 16 -> 18.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
2168*0fca6ea1SDimitry Andric
/// Wait states for the case named by this function (an N-pass XDL write of a
/// VGPR overlapping SrcA/SrcB): three more than the pass count, i.e.
/// 2 -> 5, 4 -> 7, 8 -> 11, 16 -> 19.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 3;
  return NumPasses + ExtraWaitStates;
}
2176*0fca6ea1SDimitry Andric
checkMAIHazards90A(MachineInstr * MI)2177fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2178fe6060f1SDimitry Andric int WaitStatesNeeded = 0;
2179fe6060f1SDimitry Andric unsigned Opc = MI->getOpcode();
2180fe6060f1SDimitry Andric
218181ad6265SDimitry Andric auto IsLegacyVALUFn = [](const MachineInstr &MI) {
218281ad6265SDimitry Andric return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2183fe6060f1SDimitry Andric };
2184fe6060f1SDimitry Andric
218581ad6265SDimitry Andric auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
218681ad6265SDimitry Andric return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
218781ad6265SDimitry Andric !SIInstrInfo::isDOT(MI);
2188fe6060f1SDimitry Andric };
2189fe6060f1SDimitry Andric
219081ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(*MI))
2191fe6060f1SDimitry Andric return WaitStatesNeeded;
2192fe6060f1SDimitry Andric
2193fe6060f1SDimitry Andric const int VALUWritesExecWaitStates = 4;
2194fe6060f1SDimitry Andric int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2195fe6060f1SDimitry Andric getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2196fe6060f1SDimitry Andric VALUWritesExecWaitStates);
2197fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2198fe6060f1SDimitry Andric
2199fe6060f1SDimitry Andric int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2200fe6060f1SDimitry Andric
2201fe6060f1SDimitry Andric // Loop for both DGEMM and S/HGEMM 2nd instruction.
2202fe6060f1SDimitry Andric for (const MachineOperand &Use : MI->explicit_uses()) {
2203fe6060f1SDimitry Andric const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2204fe6060f1SDimitry Andric const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2205fe6060f1SDimitry Andric const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2206fe6060f1SDimitry Andric const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2207fe6060f1SDimitry Andric const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2208fe6060f1SDimitry Andric const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2209fe6060f1SDimitry Andric const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2210fe6060f1SDimitry Andric const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2211fe6060f1SDimitry Andric const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2212fe6060f1SDimitry Andric const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2213fe6060f1SDimitry Andric const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2214fe6060f1SDimitry Andric const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2215fe6060f1SDimitry Andric const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2216fe6060f1SDimitry Andric const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2217fe6060f1SDimitry Andric const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
221881ad6265SDimitry Andric const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2219fe6060f1SDimitry Andric const int MaxWaitStates = 19;
2220fe6060f1SDimitry Andric
2221fe6060f1SDimitry Andric if (!Use.isReg())
2222fe6060f1SDimitry Andric continue;
222304eeddc0SDimitry Andric Register Reg = Use.getReg();
2224fe6060f1SDimitry Andric bool FullReg;
2225fe6060f1SDimitry Andric const MachineInstr *MI1;
2226fe6060f1SDimitry Andric
222781ad6265SDimitry Andric auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2228fe6060f1SDimitry Andric this](const MachineInstr &MI) {
222981ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(MI))
2230fe6060f1SDimitry Andric return false;
2231fe6060f1SDimitry Andric Register DstReg = MI.getOperand(0).getReg();
2232fe6060f1SDimitry Andric FullReg = (DstReg == Reg);
2233fe6060f1SDimitry Andric MI1 = &MI;
2234fe6060f1SDimitry Andric return TRI.regsOverlap(DstReg, Reg);
2235fe6060f1SDimitry Andric };
2236fe6060f1SDimitry Andric
2237fe6060f1SDimitry Andric WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2238fe6060f1SDimitry Andric getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2239fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2240fe6060f1SDimitry Andric
22414824e7fdSDimitry Andric int NumWaitStates =
22424824e7fdSDimitry Andric getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2243fe6060f1SDimitry Andric if (NumWaitStates == std::numeric_limits<int>::max())
2244fe6060f1SDimitry Andric continue;
2245fe6060f1SDimitry Andric
224606c3fb27SDimitry Andric int OpNo = Use.getOperandNo();
2247fe6060f1SDimitry Andric unsigned Opc1 = MI1->getOpcode();
2248fe6060f1SDimitry Andric int NeedWaitStates = 0;
2249fe6060f1SDimitry Andric if (OpNo == SrcCIdx) {
225081ad6265SDimitry Andric if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2251fe6060f1SDimitry Andric NeedWaitStates = 0;
2252fe6060f1SDimitry Andric } else if (FullReg) {
2253fe6060f1SDimitry Andric if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254fe6060f1SDimitry Andric Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2255fe6060f1SDimitry Andric (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2256fe6060f1SDimitry Andric Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2257fe6060f1SDimitry Andric NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
225881ad6265SDimitry Andric else if (ST.hasGFX940Insts() &&
225981ad6265SDimitry Andric TSchedModel.computeInstrLatency(MI1) == 2)
226081ad6265SDimitry Andric NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2261fe6060f1SDimitry Andric } else {
2262fe6060f1SDimitry Andric switch (Opc1) {
2263fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2264fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
226504eeddc0SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
226604eeddc0SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2267fe6060f1SDimitry Andric if (!isXDL(ST, *MI))
2268fe6060f1SDimitry Andric NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2269fe6060f1SDimitry Andric break;
2270fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2271fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2272fe6060f1SDimitry Andric if (!isXDL(ST, *MI))
2273fe6060f1SDimitry Andric NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2274fe6060f1SDimitry Andric break;
2275fe6060f1SDimitry Andric default:
2276*0fca6ea1SDimitry Andric int NumPasses = TSchedModel.computeInstrLatency(MI1);
2277*0fca6ea1SDimitry Andric if (ST.hasGFX940Insts()) {
2278*0fca6ea1SDimitry Andric if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
227981ad6265SDimitry Andric break;
2280*0fca6ea1SDimitry Andric
2281*0fca6ea1SDimitry Andric NeedWaitStates =
2282*0fca6ea1SDimitry Andric isXDL(ST, *MI1)
2283*0fca6ea1SDimitry Andric ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284*0fca6ea1SDimitry Andric NumPasses)
2285*0fca6ea1SDimitry Andric : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2286*0fca6ea1SDimitry Andric NumPasses);
2287*0fca6ea1SDimitry Andric break;
2288*0fca6ea1SDimitry Andric }
2289*0fca6ea1SDimitry Andric
2290*0fca6ea1SDimitry Andric switch (NumPasses) {
2291fe6060f1SDimitry Andric case 2:
2292*0fca6ea1SDimitry Andric NeedWaitStates =
2293*0fca6ea1SDimitry Andric isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2294fe6060f1SDimitry Andric : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2295fe6060f1SDimitry Andric break;
2296fe6060f1SDimitry Andric case 8:
2297*0fca6ea1SDimitry Andric NeedWaitStates =
2298*0fca6ea1SDimitry Andric isDGEMM(Opc)
2299fe6060f1SDimitry Andric ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2300fe6060f1SDimitry Andric : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2301fe6060f1SDimitry Andric break;
2302*0fca6ea1SDimitry Andric case 16:
2303*0fca6ea1SDimitry Andric NeedWaitStates =
2304*0fca6ea1SDimitry Andric isDGEMM(Opc)
2305fe6060f1SDimitry Andric ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2306fe6060f1SDimitry Andric : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2307*0fca6ea1SDimitry Andric break;
2308*0fca6ea1SDimitry Andric default:
2309*0fca6ea1SDimitry Andric llvm_unreachable("unexpected number of passes");
2310fe6060f1SDimitry Andric }
2311fe6060f1SDimitry Andric }
2312fe6060f1SDimitry Andric }
2313fe6060f1SDimitry Andric } else {
2314fe6060f1SDimitry Andric switch (Opc1) {
2315fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2316fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
231704eeddc0SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
231804eeddc0SDimitry Andric case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2319fe6060f1SDimitry Andric NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2320fe6060f1SDimitry Andric break;
2321fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2322fe6060f1SDimitry Andric case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2323fe6060f1SDimitry Andric NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2324fe6060f1SDimitry Andric break;
2325fe6060f1SDimitry Andric default:
2326*0fca6ea1SDimitry Andric int NumPasses = TSchedModel.computeInstrLatency(MI1);
2327*0fca6ea1SDimitry Andric
2328*0fca6ea1SDimitry Andric if (ST.hasGFX940Insts()) {
2329*0fca6ea1SDimitry Andric NeedWaitStates =
2330*0fca6ea1SDimitry Andric isXDL(ST, *MI1)
2331*0fca6ea1SDimitry Andric ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2332*0fca6ea1SDimitry Andric NumPasses)
2333*0fca6ea1SDimitry Andric : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2334*0fca6ea1SDimitry Andric NumPasses);
2335*0fca6ea1SDimitry Andric break;
2336*0fca6ea1SDimitry Andric }
2337*0fca6ea1SDimitry Andric
2338*0fca6ea1SDimitry Andric switch (NumPasses) {
2339fe6060f1SDimitry Andric case 2:
2340*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
234181ad6265SDimitry Andric break;
234281ad6265SDimitry Andric case 4:
2343*0fca6ea1SDimitry Andric llvm_unreachable("unexpected number of passes for mfma");
2344fe6060f1SDimitry Andric case 8:
2345*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2346fe6060f1SDimitry Andric break;
2347*0fca6ea1SDimitry Andric case 16:
2348fe6060f1SDimitry Andric default:
2349*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2350fe6060f1SDimitry Andric }
2351fe6060f1SDimitry Andric }
2352fe6060f1SDimitry Andric }
2353fe6060f1SDimitry Andric if (WaitStatesNeeded >= NeedWaitStates)
2354fe6060f1SDimitry Andric continue;
2355fe6060f1SDimitry Andric
2356fe6060f1SDimitry Andric WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2357fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2358fe6060f1SDimitry Andric
2359fe6060f1SDimitry Andric if (WaitStatesNeeded == MaxWaitStates)
2360fe6060f1SDimitry Andric break;
2361fe6060f1SDimitry Andric }
2362fe6060f1SDimitry Andric
2363*0fca6ea1SDimitry Andric // Pad neighboring MFMA with noops for better inter-wave performance.
2364*0fca6ea1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2365*0fca6ea1SDimitry Andric
2366fe6060f1SDimitry Andric return WaitStatesNeeded;
2367fe6060f1SDimitry Andric }
2368fe6060f1SDimitry Andric
checkMAILdStHazards(MachineInstr * MI)23690b57cec5SDimitry Andric int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2370349cc55cSDimitry Andric // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2371fe6060f1SDimitry Andric if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
23720b57cec5SDimitry Andric return 0;
23730b57cec5SDimitry Andric
23740b57cec5SDimitry Andric int WaitStatesNeeded = 0;
23750b57cec5SDimitry Andric
2376fe6060f1SDimitry Andric auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2377fe6060f1SDimitry Andric return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
23780b57cec5SDimitry Andric };
23790b57cec5SDimitry Andric
23800b57cec5SDimitry Andric for (const MachineOperand &Op : MI->explicit_uses()) {
23810b57cec5SDimitry Andric if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
23820b57cec5SDimitry Andric continue;
23830b57cec5SDimitry Andric
23848bcb0991SDimitry Andric Register Reg = Op.getReg();
23850b57cec5SDimitry Andric
23860b57cec5SDimitry Andric const int AccVgprReadLdStWaitStates = 2;
2387e8d8bef9SDimitry Andric const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
23880b57cec5SDimitry Andric const int MaxWaitStates = 2;
23890b57cec5SDimitry Andric
23900b57cec5SDimitry Andric int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
23910b57cec5SDimitry Andric getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
23920b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
23930b57cec5SDimitry Andric
23940b57cec5SDimitry Andric if (WaitStatesNeeded == MaxWaitStates)
23950b57cec5SDimitry Andric return WaitStatesNeeded; // Early exit.
23960b57cec5SDimitry Andric
2397fe6060f1SDimitry Andric auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2398fe6060f1SDimitry Andric if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2399fe6060f1SDimitry Andric MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
24000b57cec5SDimitry Andric return false;
2401fe6060f1SDimitry Andric auto IsVALUFn = [](const MachineInstr &MI) {
2402fe6060f1SDimitry Andric return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
24030b57cec5SDimitry Andric };
24040b57cec5SDimitry Andric return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
24050b57cec5SDimitry Andric std::numeric_limits<int>::max();
24060b57cec5SDimitry Andric };
24070b57cec5SDimitry Andric
2408e8d8bef9SDimitry Andric WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2409e8d8bef9SDimitry Andric getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
24100b57cec5SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
24110b57cec5SDimitry Andric }
24120b57cec5SDimitry Andric
24130b57cec5SDimitry Andric return WaitStatesNeeded;
24140b57cec5SDimitry Andric }
2415e8d8bef9SDimitry Andric
/// Wait states that checkMAIVALUHazards() requests on GFX940 before an
/// instruction writes a register that a non-XDL, non-DGEMM MFMA taking
/// \p NumPasses passes wrote earlier.
///
/// \returns NumPasses + 2.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2423*0fca6ea1SDimitry Andric
/// Wait states that checkMAIVALUHazards() requests on GFX940 before an
/// instruction writes a register that an XDL MFMA taking \p NumPasses passes
/// wrote earlier.
///
/// \returns NumPasses + 3.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}
2431*0fca6ea1SDimitry Andric
/// Wait states that checkMAIVALUHazards() requests on GFX940 before a VALU,
/// memory or export instruction reads a register written by an XDL MFMA
/// taking \p NumPasses passes.
///
/// \returns NumPasses + 3.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 5
  // 4 pass -> 7
  // 8 pass -> 11
  // 16 pass -> 19
  return NumPasses + 3;
}
2439*0fca6ea1SDimitry Andric
/// Wait states that checkMAIVALUHazards() requests on GFX940 before a VALU,
/// memory or export instruction reads a register written by a non-XDL,
/// non-DGEMM MFMA taking \p NumPasses passes.
///
/// \returns NumPasses + 2.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2447*0fca6ea1SDimitry Andric
checkMAIVALUHazards(MachineInstr * MI)2448fe6060f1SDimitry Andric int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2449fe6060f1SDimitry Andric if (!ST.hasGFX90AInsts())
2450fe6060f1SDimitry Andric return 0;
2451fe6060f1SDimitry Andric
2452fe6060f1SDimitry Andric auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2453fe6060f1SDimitry Andric return isDGEMM(MI.getOpcode());
2454fe6060f1SDimitry Andric };
2455fe6060f1SDimitry Andric
2456fe6060f1SDimitry Andric // This is checked in checkMAIHazards90A()
245781ad6265SDimitry Andric if (SIInstrInfo::isMFMA(*MI))
2458fe6060f1SDimitry Andric return 0;
2459fe6060f1SDimitry Andric
2460bdd1243dSDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo();
2461bdd1243dSDimitry Andric
2462fe6060f1SDimitry Andric int WaitStatesNeeded = 0;
2463fe6060f1SDimitry Andric
2464bdd1243dSDimitry Andric bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2465fe6060f1SDimitry Andric SIInstrInfo::isFLAT(*MI) ||
2466bdd1243dSDimitry Andric SIInstrInfo::isDS(*MI);
2467bdd1243dSDimitry Andric bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2468fe6060f1SDimitry Andric bool IsVALU = SIInstrInfo::isVALU(*MI);
2469fe6060f1SDimitry Andric
2470fe6060f1SDimitry Andric const MachineInstr *MFMA = nullptr;
2471fe6060f1SDimitry Andric unsigned Reg;
247281ad6265SDimitry Andric auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
247381ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(MI) ||
247481ad6265SDimitry Andric !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2475fe6060f1SDimitry Andric return false;
2476fe6060f1SDimitry Andric MFMA = &MI;
2477fe6060f1SDimitry Andric return true;
2478fe6060f1SDimitry Andric };
2479fe6060f1SDimitry Andric
2480fe6060f1SDimitry Andric const MachineInstr *DOT = nullptr;
2481fe6060f1SDimitry Andric auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2482fe6060f1SDimitry Andric if (!SIInstrInfo::isDOT(MI) ||
2483fe6060f1SDimitry Andric !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2484fe6060f1SDimitry Andric return false;
2485fe6060f1SDimitry Andric DOT = &MI;
2486fe6060f1SDimitry Andric return true;
2487fe6060f1SDimitry Andric };
2488fe6060f1SDimitry Andric
2489bdd1243dSDimitry Andric bool DGEMMAfterVALUWrite = false;
2490bdd1243dSDimitry Andric auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2491bdd1243dSDimitry Andric // Found DGEMM on reverse traversal to def.
2492bdd1243dSDimitry Andric if (isDGEMM(MI.getOpcode()))
2493bdd1243dSDimitry Andric DGEMMAfterVALUWrite = true;
2494bdd1243dSDimitry Andric
2495bdd1243dSDimitry Andric // Only hazard if register is defined by a VALU and a DGEMM is found after
2496bdd1243dSDimitry Andric // after the def.
2497bdd1243dSDimitry Andric if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2498bdd1243dSDimitry Andric return false;
2499bdd1243dSDimitry Andric
2500bdd1243dSDimitry Andric return true;
2501bdd1243dSDimitry Andric };
2502bdd1243dSDimitry Andric
2503fe6060f1SDimitry Andric int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2504fe6060f1SDimitry Andric AMDGPU::OpName::src2);
2505fe6060f1SDimitry Andric
2506fe6060f1SDimitry Andric if (IsMemOrExport || IsVALU) {
2507fe6060f1SDimitry Andric const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2508fe6060f1SDimitry Andric const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2509fe6060f1SDimitry Andric const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2510fe6060f1SDimitry Andric const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2511fe6060f1SDimitry Andric const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2512fe6060f1SDimitry Andric const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2513fe6060f1SDimitry Andric const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2514fe6060f1SDimitry Andric const int DotWriteSameDotReadSrcAB = 3;
2515fe6060f1SDimitry Andric const int DotWriteDifferentVALURead = 3;
2516bdd1243dSDimitry Andric const int DMFMABetweenVALUWriteVMEMRead = 2;
2517fe6060f1SDimitry Andric const int MaxWaitStates = 19;
2518fe6060f1SDimitry Andric
2519fe6060f1SDimitry Andric for (const MachineOperand &Use : MI->explicit_uses()) {
2520fe6060f1SDimitry Andric if (!Use.isReg())
2521fe6060f1SDimitry Andric continue;
2522fe6060f1SDimitry Andric Reg = Use.getReg();
2523fe6060f1SDimitry Andric
2524fe6060f1SDimitry Andric DOT = nullptr;
2525fe6060f1SDimitry Andric int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2526fe6060f1SDimitry Andric MaxWaitStates);
2527fe6060f1SDimitry Andric if (DOT) {
2528fe6060f1SDimitry Andric int NeedWaitStates = 0;
2529fe6060f1SDimitry Andric if (DOT->getOpcode() == MI->getOpcode()) {
2530fe6060f1SDimitry Andric if (&Use - &MI->getOperand(0) != SrcCIdx)
2531fe6060f1SDimitry Andric NeedWaitStates = DotWriteSameDotReadSrcAB;
2532fe6060f1SDimitry Andric } else {
2533fe6060f1SDimitry Andric NeedWaitStates = DotWriteDifferentVALURead;
2534fe6060f1SDimitry Andric }
2535fe6060f1SDimitry Andric
2536fe6060f1SDimitry Andric int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2537fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2538fe6060f1SDimitry Andric }
2539fe6060f1SDimitry Andric
2540bdd1243dSDimitry Andric // Workaround for HW data hazard bug observed only in GFX90A. When there
2541bdd1243dSDimitry Andric // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2542bdd1243dSDimitry Andric // causes the SQ to incorrectly not insert two wait states between the two
2543bdd1243dSDimitry Andric // instructions needed to avoid data hazard.
2544bdd1243dSDimitry Andric if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2545bdd1243dSDimitry Andric DGEMMAfterVALUWrite = false;
2546bdd1243dSDimitry Andric if (TRI.isVectorRegister(MRI, Reg)) {
2547bdd1243dSDimitry Andric int WaitStatesNeededForUse =
2548bdd1243dSDimitry Andric DMFMABetweenVALUWriteVMEMRead -
2549bdd1243dSDimitry Andric getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2550bdd1243dSDimitry Andric DMFMABetweenVALUWriteVMEMRead);
2551bdd1243dSDimitry Andric
2552bdd1243dSDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2553bdd1243dSDimitry Andric }
2554bdd1243dSDimitry Andric }
2555bdd1243dSDimitry Andric
2556fe6060f1SDimitry Andric MFMA = nullptr;
25574824e7fdSDimitry Andric WaitStatesSinceDef =
25584824e7fdSDimitry Andric getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2559fe6060f1SDimitry Andric if (!MFMA)
2560fe6060f1SDimitry Andric continue;
2561fe6060f1SDimitry Andric
2562fe6060f1SDimitry Andric unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2563*0fca6ea1SDimitry Andric int NumPasses = HazardDefLatency;
2564fe6060f1SDimitry Andric int NeedWaitStates = MaxWaitStates;
2565*0fca6ea1SDimitry Andric
2566*0fca6ea1SDimitry Andric if (isDGEMM(MFMA->getOpcode())) {
2567fe6060f1SDimitry Andric switch (HazardDefLatency) {
2568fe6060f1SDimitry Andric case 4:
2569*0fca6ea1SDimitry Andric NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2570*0fca6ea1SDimitry Andric : DMFMA4x4WriteVgprVALUReadWaitStates;
2571fe6060f1SDimitry Andric break;
2572fe6060f1SDimitry Andric case 8:
2573*0fca6ea1SDimitry Andric case 16:
2574*0fca6ea1SDimitry Andric NeedWaitStates = IsMemOrExport
2575*0fca6ea1SDimitry Andric ? DMFMA16x16WriteVgprMemExpReadWaitStates
2576*0fca6ea1SDimitry Andric : DMFMA16x16WriteVgprVALUReadWaitStates;
2577fe6060f1SDimitry Andric break;
2578fe6060f1SDimitry Andric default:
2579*0fca6ea1SDimitry Andric llvm_unreachable("unexpected dgemm");
2580*0fca6ea1SDimitry Andric }
2581*0fca6ea1SDimitry Andric } else if (ST.hasGFX940Insts()) {
2582fe6060f1SDimitry Andric NeedWaitStates =
2583*0fca6ea1SDimitry Andric isXDL(ST, *MFMA)
2584*0fca6ea1SDimitry Andric ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2585*0fca6ea1SDimitry Andric : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2586*0fca6ea1SDimitry Andric NumPasses);
2587*0fca6ea1SDimitry Andric } else {
2588*0fca6ea1SDimitry Andric switch (HazardDefLatency) {
2589*0fca6ea1SDimitry Andric case 2:
2590*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2591fe6060f1SDimitry Andric break;
2592*0fca6ea1SDimitry Andric case 8:
2593*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2594*0fca6ea1SDimitry Andric break;
2595*0fca6ea1SDimitry Andric case 16:
2596*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2597*0fca6ea1SDimitry Andric break;
2598*0fca6ea1SDimitry Andric default:
2599*0fca6ea1SDimitry Andric llvm_unreachable("unexpected number of passes for mfma");
2600*0fca6ea1SDimitry Andric }
2601fe6060f1SDimitry Andric }
2602fe6060f1SDimitry Andric
2603fe6060f1SDimitry Andric int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2604fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2605fe6060f1SDimitry Andric
2606fe6060f1SDimitry Andric if (WaitStatesNeeded == MaxWaitStates)
2607fe6060f1SDimitry Andric break;
2608fe6060f1SDimitry Andric }
2609fe6060f1SDimitry Andric }
2610fe6060f1SDimitry Andric
2611fe6060f1SDimitry Andric unsigned Opc = MI->getOpcode();
2612fe6060f1SDimitry Andric const int DMFMAToFMA64WaitStates = 2;
2613fe6060f1SDimitry Andric if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2614fe6060f1SDimitry Andric Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2615fe6060f1SDimitry Andric Opc == AMDGPU::V_FMAC_F64_dpp) &&
2616fe6060f1SDimitry Andric WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2617fe6060f1SDimitry Andric int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2618fe6060f1SDimitry Andric getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2619fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2620fe6060f1SDimitry Andric }
2621fe6060f1SDimitry Andric
2622fe6060f1SDimitry Andric if (!IsVALU && !IsMemOrExport)
2623fe6060f1SDimitry Andric return WaitStatesNeeded;
2624fe6060f1SDimitry Andric
2625fe6060f1SDimitry Andric for (const MachineOperand &Def : MI->defs()) {
2626fe6060f1SDimitry Andric const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2627fe6060f1SDimitry Andric const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2628fe6060f1SDimitry Andric const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2629fe6060f1SDimitry Andric const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
263081ad6265SDimitry Andric const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2631fe6060f1SDimitry Andric const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2632fe6060f1SDimitry Andric const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2633fe6060f1SDimitry Andric const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2634fe6060f1SDimitry Andric const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2635fe6060f1SDimitry Andric const int DotWriteDifferentVALUWrite = 3;
2636fe6060f1SDimitry Andric const int MaxWaitStates = 19;
2637fe6060f1SDimitry Andric const int MaxWarWaitStates = 15;
2638fe6060f1SDimitry Andric
2639fe6060f1SDimitry Andric Reg = Def.getReg();
2640fe6060f1SDimitry Andric
2641fe6060f1SDimitry Andric DOT = nullptr;
2642fe6060f1SDimitry Andric int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2643fe6060f1SDimitry Andric MaxWaitStates);
2644fe6060f1SDimitry Andric if (DOT && DOT->getOpcode() != MI->getOpcode())
2645fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2646fe6060f1SDimitry Andric WaitStatesSinceDef);
2647fe6060f1SDimitry Andric
2648fe6060f1SDimitry Andric MFMA = nullptr;
26494824e7fdSDimitry Andric WaitStatesSinceDef =
26504824e7fdSDimitry Andric getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2651fe6060f1SDimitry Andric if (MFMA) {
2652fe6060f1SDimitry Andric int NeedWaitStates = MaxWaitStates;
2653*0fca6ea1SDimitry Andric int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2654*0fca6ea1SDimitry Andric
2655*0fca6ea1SDimitry Andric if (isDGEMM(MFMA->getOpcode())) {
2656*0fca6ea1SDimitry Andric switch (NumPasses) {
2657fe6060f1SDimitry Andric case 4:
2658*0fca6ea1SDimitry Andric NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2659fe6060f1SDimitry Andric break;
2660fe6060f1SDimitry Andric case 8:
2661*0fca6ea1SDimitry Andric case 16:
2662*0fca6ea1SDimitry Andric NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2663fe6060f1SDimitry Andric break;
2664fe6060f1SDimitry Andric default:
2665*0fca6ea1SDimitry Andric llvm_unreachable("unexpected number of cycles for dgemm");
2666*0fca6ea1SDimitry Andric }
2667*0fca6ea1SDimitry Andric } else if (ST.hasGFX940Insts()) {
2668*0fca6ea1SDimitry Andric NeedWaitStates =
2669*0fca6ea1SDimitry Andric isXDL(ST, *MFMA)
2670*0fca6ea1SDimitry Andric ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2671*0fca6ea1SDimitry Andric : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2672*0fca6ea1SDimitry Andric } else {
2673*0fca6ea1SDimitry Andric switch (NumPasses) {
2674*0fca6ea1SDimitry Andric case 2:
2675*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2676fe6060f1SDimitry Andric break;
2677*0fca6ea1SDimitry Andric case 8:
2678*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2679*0fca6ea1SDimitry Andric break;
2680*0fca6ea1SDimitry Andric case 16:
2681*0fca6ea1SDimitry Andric NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2682*0fca6ea1SDimitry Andric break;
2683*0fca6ea1SDimitry Andric default:
2684*0fca6ea1SDimitry Andric llvm_unreachable("Unexpected number of passes for mfma");
2685*0fca6ea1SDimitry Andric }
2686fe6060f1SDimitry Andric }
2687fe6060f1SDimitry Andric
2688fe6060f1SDimitry Andric int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2689fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2690fe6060f1SDimitry Andric
2691fe6060f1SDimitry Andric if (WaitStatesNeeded == MaxWaitStates)
2692fe6060f1SDimitry Andric break;
2693fe6060f1SDimitry Andric }
2694fe6060f1SDimitry Andric
269581ad6265SDimitry Andric auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
269681ad6265SDimitry Andric if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2697fe6060f1SDimitry Andric !MI.readsRegister(Reg, &TRI))
2698fe6060f1SDimitry Andric return false;
2699fe6060f1SDimitry Andric
270081ad6265SDimitry Andric if (ST.hasGFX940Insts() && !isXDL(ST, MI))
270181ad6265SDimitry Andric return false;
270281ad6265SDimitry Andric
2703fe6060f1SDimitry Andric const MachineOperand *SrcC =
2704fe6060f1SDimitry Andric TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2705fe6060f1SDimitry Andric assert(SrcC);
2706fe6060f1SDimitry Andric if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2707fe6060f1SDimitry Andric return false;
2708fe6060f1SDimitry Andric
2709fe6060f1SDimitry Andric MFMA = &MI;
2710fe6060f1SDimitry Andric return true;
2711fe6060f1SDimitry Andric };
2712fe6060f1SDimitry Andric
2713fe6060f1SDimitry Andric MFMA = nullptr;
2714fe6060f1SDimitry Andric int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2715fe6060f1SDimitry Andric MaxWarWaitStates);
2716fe6060f1SDimitry Andric if (!MFMA)
2717fe6060f1SDimitry Andric continue;
2718fe6060f1SDimitry Andric
2719fe6060f1SDimitry Andric unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2720fe6060f1SDimitry Andric int NeedWaitStates = MaxWaitStates;
2721fe6060f1SDimitry Andric switch (HazardDefLatency) {
2722fe6060f1SDimitry Andric case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2723fe6060f1SDimitry Andric break;
272481ad6265SDimitry Andric case 4: assert(ST.hasGFX940Insts());
272581ad6265SDimitry Andric NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
272681ad6265SDimitry Andric break;
2727fe6060f1SDimitry Andric case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2728fe6060f1SDimitry Andric break;
2729bdd1243dSDimitry Andric case 16: [[fallthrough]];
2730fe6060f1SDimitry Andric default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2731fe6060f1SDimitry Andric break;
2732fe6060f1SDimitry Andric }
2733fe6060f1SDimitry Andric
2734fe6060f1SDimitry Andric int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2735fe6060f1SDimitry Andric WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2736fe6060f1SDimitry Andric }
2737fe6060f1SDimitry Andric
2738fe6060f1SDimitry Andric return WaitStatesNeeded;
2739fe6060f1SDimitry Andric }
2740fe6060f1SDimitry Andric
ShouldPreferAnother(SUnit * SU)2741e8d8bef9SDimitry Andric bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2742e8d8bef9SDimitry Andric if (!SU->isInstr())
2743e8d8bef9SDimitry Andric return false;
2744e8d8bef9SDimitry Andric
2745fe6060f1SDimitry Andric const MachineInstr *MAI = nullptr;
274681ad6265SDimitry Andric
2747fe6060f1SDimitry Andric auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2748e8d8bef9SDimitry Andric MAI = nullptr;
274981ad6265SDimitry Andric if (SIInstrInfo::isMFMA(MI))
2750fe6060f1SDimitry Andric MAI = &MI;
2751e8d8bef9SDimitry Andric return MAI != nullptr;
2752e8d8bef9SDimitry Andric };
2753e8d8bef9SDimitry Andric
2754e8d8bef9SDimitry Andric MachineInstr *MI = SU->getInstr();
2755fe6060f1SDimitry Andric if (IsMFMAFn(*MI)) {
2756e8d8bef9SDimitry Andric int W = getWaitStatesSince(IsMFMAFn, 16);
2757e8d8bef9SDimitry Andric if (MAI)
2758e8d8bef9SDimitry Andric return W < (int)TSchedModel.computeInstrLatency(MAI);
2759e8d8bef9SDimitry Andric }
2760e8d8bef9SDimitry Andric
2761e8d8bef9SDimitry Andric return false;
2762e8d8bef9SDimitry Andric }
2763bdd1243dSDimitry Andric
fixVALUMaskWriteHazard(MachineInstr * MI)2764bdd1243dSDimitry Andric bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2765bdd1243dSDimitry Andric if (!ST.hasVALUMaskWriteHazard())
2766bdd1243dSDimitry Andric return false;
27677a6dacacSDimitry Andric assert(!ST.hasExtendedWaitCounts());
27687a6dacacSDimitry Andric
27697a6dacacSDimitry Andric if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2770bdd1243dSDimitry Andric return false;
2771bdd1243dSDimitry Andric
2772bdd1243dSDimitry Andric // The hazard sequence is three instructions:
2773bdd1243dSDimitry Andric // 1. VALU reads SGPR as mask
2774bdd1243dSDimitry Andric // 2. SALU writes SGPR
2775bdd1243dSDimitry Andric // 3. SALU reads SGPR
2776bdd1243dSDimitry Andric // The hazard can expire if the distance between 2 and 3 is sufficient.
2777bdd1243dSDimitry Andric // In practice this happens <10% of the time, hence this always assumes
2778bdd1243dSDimitry Andric // the hazard exists if 1 and 2 are present to avoid searching.
2779bdd1243dSDimitry Andric
2780bdd1243dSDimitry Andric const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2781bdd1243dSDimitry Andric if (!SDSTOp || !SDSTOp->isReg())
2782bdd1243dSDimitry Andric return false;
2783bdd1243dSDimitry Andric
2784bdd1243dSDimitry Andric const Register HazardReg = SDSTOp->getReg();
2785bdd1243dSDimitry Andric if (HazardReg == AMDGPU::EXEC ||
2786bdd1243dSDimitry Andric HazardReg == AMDGPU::EXEC_LO ||
2787bdd1243dSDimitry Andric HazardReg == AMDGPU::EXEC_HI ||
2788bdd1243dSDimitry Andric HazardReg == AMDGPU::M0)
2789bdd1243dSDimitry Andric return false;
2790bdd1243dSDimitry Andric
2791bdd1243dSDimitry Andric auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2792bdd1243dSDimitry Andric switch (I.getOpcode()) {
2793bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_e32:
2794bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_dpp:
2795bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_e32:
2796bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_dpp:
2797bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_e32:
2798bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_dpp:
2799bdd1243dSDimitry Andric case AMDGPU::V_DIV_FMAS_F32_e64:
2800bdd1243dSDimitry Andric case AMDGPU::V_DIV_FMAS_F64_e64:
2801bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_e32:
2802bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_dpp:
2803bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_e32:
2804bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_dpp:
2805bdd1243dSDimitry Andric // These implicitly read VCC as mask source.
2806bdd1243dSDimitry Andric return HazardReg == AMDGPU::VCC ||
2807bdd1243dSDimitry Andric HazardReg == AMDGPU::VCC_LO ||
2808bdd1243dSDimitry Andric HazardReg == AMDGPU::VCC_HI;
2809bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_e64:
2810bdd1243dSDimitry Andric case AMDGPU::V_ADDC_U32_e64_dpp:
2811bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_e64:
2812bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B16_e64_dpp:
2813bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_e64:
2814bdd1243dSDimitry Andric case AMDGPU::V_CNDMASK_B32_e64_dpp:
2815bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_e64:
2816bdd1243dSDimitry Andric case AMDGPU::V_SUBB_U32_e64_dpp:
2817bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_e64:
2818bdd1243dSDimitry Andric case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2819bdd1243dSDimitry Andric // Only check mask register overlaps.
2820bdd1243dSDimitry Andric const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2821bdd1243dSDimitry Andric assert(SSRCOp);
2822bdd1243dSDimitry Andric return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2823bdd1243dSDimitry Andric }
2824bdd1243dSDimitry Andric default:
2825bdd1243dSDimitry Andric return false;
2826bdd1243dSDimitry Andric }
2827bdd1243dSDimitry Andric };
2828bdd1243dSDimitry Andric
2829bdd1243dSDimitry Andric const MachineRegisterInfo &MRI = MF.getRegInfo();
2830bdd1243dSDimitry Andric auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2831bdd1243dSDimitry Andric // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2832bdd1243dSDimitry Andric if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
283306c3fb27SDimitry Andric AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2834bdd1243dSDimitry Andric return true;
2835bdd1243dSDimitry Andric
2836bdd1243dSDimitry Andric // VALU access to any SGPR or literal constant other than HazardReg
2837bdd1243dSDimitry Andric // mitigates hazard. No need to check HazardReg here as this will
2838bdd1243dSDimitry Andric // only be called when !IsHazardFn.
2839bdd1243dSDimitry Andric if (!SIInstrInfo::isVALU(I))
2840bdd1243dSDimitry Andric return false;
2841bdd1243dSDimitry Andric for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2842bdd1243dSDimitry Andric const MachineOperand &Op = I.getOperand(OpNo);
2843bdd1243dSDimitry Andric if (Op.isReg()) {
2844bdd1243dSDimitry Andric Register OpReg = Op.getReg();
2845bdd1243dSDimitry Andric // Only consider uses
2846bdd1243dSDimitry Andric if (!Op.isUse())
2847bdd1243dSDimitry Andric continue;
2848bdd1243dSDimitry Andric // Ignore EXEC
2849bdd1243dSDimitry Andric if (OpReg == AMDGPU::EXEC ||
2850bdd1243dSDimitry Andric OpReg == AMDGPU::EXEC_LO ||
2851bdd1243dSDimitry Andric OpReg == AMDGPU::EXEC_HI)
2852bdd1243dSDimitry Andric continue;
2853bdd1243dSDimitry Andric // Ignore all implicit uses except VCC
2854bdd1243dSDimitry Andric if (Op.isImplicit()) {
2855bdd1243dSDimitry Andric if (OpReg == AMDGPU::VCC ||
2856bdd1243dSDimitry Andric OpReg == AMDGPU::VCC_LO ||
2857bdd1243dSDimitry Andric OpReg == AMDGPU::VCC_HI)
2858bdd1243dSDimitry Andric return true;
2859bdd1243dSDimitry Andric continue;
2860bdd1243dSDimitry Andric }
2861bdd1243dSDimitry Andric if (TRI.isSGPRReg(MRI, OpReg))
2862bdd1243dSDimitry Andric return true;
2863bdd1243dSDimitry Andric } else {
2864bdd1243dSDimitry Andric const MCInstrDesc &InstDesc = I.getDesc();
2865bdd1243dSDimitry Andric const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2866bdd1243dSDimitry Andric if (!TII.isInlineConstant(Op, OpInfo))
2867bdd1243dSDimitry Andric return true;
2868bdd1243dSDimitry Andric }
2869bdd1243dSDimitry Andric }
2870bdd1243dSDimitry Andric return false;
2871bdd1243dSDimitry Andric };
2872bdd1243dSDimitry Andric
2873bdd1243dSDimitry Andric // Check for hazard
2874bdd1243dSDimitry Andric if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2875bdd1243dSDimitry Andric std::numeric_limits<int>::max())
2876bdd1243dSDimitry Andric return false;
2877bdd1243dSDimitry Andric
2878bdd1243dSDimitry Andric auto NextMI = std::next(MI->getIterator());
2879bdd1243dSDimitry Andric
2880bdd1243dSDimitry Andric // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2881bdd1243dSDimitry Andric BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2882bdd1243dSDimitry Andric TII.get(AMDGPU::S_WAITCNT_DEPCTR))
288306c3fb27SDimitry Andric .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2884bdd1243dSDimitry Andric
2885bdd1243dSDimitry Andric // SALU write may be s_getpc in a bundle.
2886bdd1243dSDimitry Andric if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2887bdd1243dSDimitry Andric // Update offsets of any references in the bundle.
2888bdd1243dSDimitry Andric while (NextMI != MI->getParent()->end() &&
2889bdd1243dSDimitry Andric NextMI->isBundledWithPred()) {
2890bdd1243dSDimitry Andric for (auto &Operand : NextMI->operands()) {
2891bdd1243dSDimitry Andric if (Operand.isGlobal())
2892bdd1243dSDimitry Andric Operand.setOffset(Operand.getOffset() + 4);
2893bdd1243dSDimitry Andric }
2894bdd1243dSDimitry Andric NextMI++;
2895bdd1243dSDimitry Andric }
2896bdd1243dSDimitry Andric }
2897bdd1243dSDimitry Andric
2898bdd1243dSDimitry Andric return true;
2899bdd1243dSDimitry Andric }
2900*0fca6ea1SDimitry Andric
ensureEntrySetPrio(MachineFunction * MF,int Priority,const SIInstrInfo & TII)2901*0fca6ea1SDimitry Andric static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
2902*0fca6ea1SDimitry Andric const SIInstrInfo &TII) {
2903*0fca6ea1SDimitry Andric MachineBasicBlock &EntryMBB = MF->front();
2904*0fca6ea1SDimitry Andric if (EntryMBB.begin() != EntryMBB.end()) {
2905*0fca6ea1SDimitry Andric auto &EntryMI = *EntryMBB.begin();
2906*0fca6ea1SDimitry Andric if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
2907*0fca6ea1SDimitry Andric EntryMI.getOperand(0).getImm() >= Priority)
2908*0fca6ea1SDimitry Andric return false;
2909*0fca6ea1SDimitry Andric }
2910*0fca6ea1SDimitry Andric
2911*0fca6ea1SDimitry Andric BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
2912*0fca6ea1SDimitry Andric .addImm(Priority);
2913*0fca6ea1SDimitry Andric return true;
2914*0fca6ea1SDimitry Andric }
2915*0fca6ea1SDimitry Andric
fixRequiredExportPriority(MachineInstr * MI)2916*0fca6ea1SDimitry Andric bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
2917*0fca6ea1SDimitry Andric if (!ST.hasRequiredExportPriority())
2918*0fca6ea1SDimitry Andric return false;
2919*0fca6ea1SDimitry Andric
2920*0fca6ea1SDimitry Andric // Assume the following shader types will never have exports,
2921*0fca6ea1SDimitry Andric // and avoid adding or adjusting S_SETPRIO.
2922*0fca6ea1SDimitry Andric MachineBasicBlock *MBB = MI->getParent();
2923*0fca6ea1SDimitry Andric MachineFunction *MF = MBB->getParent();
2924*0fca6ea1SDimitry Andric auto CC = MF->getFunction().getCallingConv();
2925*0fca6ea1SDimitry Andric switch (CC) {
2926*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_CS:
2927*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_CS_Chain:
2928*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_CS_ChainPreserve:
2929*0fca6ea1SDimitry Andric case CallingConv::AMDGPU_KERNEL:
2930*0fca6ea1SDimitry Andric return false;
2931*0fca6ea1SDimitry Andric default:
2932*0fca6ea1SDimitry Andric break;
2933*0fca6ea1SDimitry Andric }
2934*0fca6ea1SDimitry Andric
2935*0fca6ea1SDimitry Andric const int MaxPriority = 3;
2936*0fca6ea1SDimitry Andric const int NormalPriority = 2;
2937*0fca6ea1SDimitry Andric const int PostExportPriority = 0;
2938*0fca6ea1SDimitry Andric
2939*0fca6ea1SDimitry Andric auto It = MI->getIterator();
2940*0fca6ea1SDimitry Andric switch (MI->getOpcode()) {
2941*0fca6ea1SDimitry Andric case AMDGPU::S_ENDPGM:
2942*0fca6ea1SDimitry Andric case AMDGPU::S_ENDPGM_SAVED:
2943*0fca6ea1SDimitry Andric case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
2944*0fca6ea1SDimitry Andric case AMDGPU::SI_RETURN_TO_EPILOG:
2945*0fca6ea1SDimitry Andric // Ensure shader with calls raises priority at entry.
2946*0fca6ea1SDimitry Andric // This ensures correct priority if exports exist in callee.
2947*0fca6ea1SDimitry Andric if (MF->getFrameInfo().hasCalls())
2948*0fca6ea1SDimitry Andric return ensureEntrySetPrio(MF, NormalPriority, TII);
2949*0fca6ea1SDimitry Andric return false;
2950*0fca6ea1SDimitry Andric case AMDGPU::S_SETPRIO: {
2951*0fca6ea1SDimitry Andric // Raise minimum priority unless in workaround.
2952*0fca6ea1SDimitry Andric auto &PrioOp = MI->getOperand(0);
2953*0fca6ea1SDimitry Andric int Prio = PrioOp.getImm();
2954*0fca6ea1SDimitry Andric bool InWA = (Prio == PostExportPriority) &&
2955*0fca6ea1SDimitry Andric (It != MBB->begin() && TII.isEXP(*std::prev(It)));
2956*0fca6ea1SDimitry Andric if (InWA || Prio >= NormalPriority)
2957*0fca6ea1SDimitry Andric return false;
2958*0fca6ea1SDimitry Andric PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
2959*0fca6ea1SDimitry Andric return true;
2960*0fca6ea1SDimitry Andric }
2961*0fca6ea1SDimitry Andric default:
2962*0fca6ea1SDimitry Andric if (!TII.isEXP(*MI))
2963*0fca6ea1SDimitry Andric return false;
2964*0fca6ea1SDimitry Andric break;
2965*0fca6ea1SDimitry Andric }
2966*0fca6ea1SDimitry Andric
2967*0fca6ea1SDimitry Andric // Check entry priority at each export (as there will only be a few).
2968*0fca6ea1SDimitry Andric // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
2969*0fca6ea1SDimitry Andric bool Changed = false;
2970*0fca6ea1SDimitry Andric if (CC != CallingConv::AMDGPU_Gfx)
2971*0fca6ea1SDimitry Andric Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
2972*0fca6ea1SDimitry Andric
2973*0fca6ea1SDimitry Andric auto NextMI = std::next(It);
2974*0fca6ea1SDimitry Andric bool EndOfShader = false;
2975*0fca6ea1SDimitry Andric if (NextMI != MBB->end()) {
2976*0fca6ea1SDimitry Andric // Only need WA at end of sequence of exports.
2977*0fca6ea1SDimitry Andric if (TII.isEXP(*NextMI))
2978*0fca6ea1SDimitry Andric return Changed;
2979*0fca6ea1SDimitry Andric // Assume appropriate S_SETPRIO after export means WA already applied.
2980*0fca6ea1SDimitry Andric if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
2981*0fca6ea1SDimitry Andric NextMI->getOperand(0).getImm() == PostExportPriority)
2982*0fca6ea1SDimitry Andric return Changed;
2983*0fca6ea1SDimitry Andric EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
2984*0fca6ea1SDimitry Andric }
2985*0fca6ea1SDimitry Andric
2986*0fca6ea1SDimitry Andric const DebugLoc &DL = MI->getDebugLoc();
2987*0fca6ea1SDimitry Andric
2988*0fca6ea1SDimitry Andric // Lower priority.
2989*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
2990*0fca6ea1SDimitry Andric .addImm(PostExportPriority);
2991*0fca6ea1SDimitry Andric
2992*0fca6ea1SDimitry Andric if (!EndOfShader) {
2993*0fca6ea1SDimitry Andric // Wait for exports to complete.
2994*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
2995*0fca6ea1SDimitry Andric .addReg(AMDGPU::SGPR_NULL)
2996*0fca6ea1SDimitry Andric .addImm(0);
2997*0fca6ea1SDimitry Andric }
2998*0fca6ea1SDimitry Andric
2999*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3000*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3001*0fca6ea1SDimitry Andric
3002*0fca6ea1SDimitry Andric if (!EndOfShader) {
3003*0fca6ea1SDimitry Andric // Return to normal (higher) priority.
3004*0fca6ea1SDimitry Andric BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3005*0fca6ea1SDimitry Andric .addImm(NormalPriority);
3006*0fca6ea1SDimitry Andric }
3007*0fca6ea1SDimitry Andric
3008*0fca6ea1SDimitry Andric return true;
3009*0fca6ea1SDimitry Andric }
3010