//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
//
//===----------------------------------------------------------------------===//
13*700637cbSDimitry Andric
#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
#include <bitset>
#include <optional>
20*700637cbSDimitry Andric
21*700637cbSDimitry Andric using namespace llvm;
22*700637cbSDimitry Andric
23*700637cbSDimitry Andric #define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
24*700637cbSDimitry Andric
25*700637cbSDimitry Andric static cl::opt<bool> GlobalEnableSGPRHazardWaits(
26*700637cbSDimitry Andric "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
27*700637cbSDimitry Andric cl::desc("Enable required s_wait_alu on SGPR hazards"));
28*700637cbSDimitry Andric
29*700637cbSDimitry Andric static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
30*700637cbSDimitry Andric "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
31*700637cbSDimitry Andric cl::desc("Cull hazards on function boundaries"));
32*700637cbSDimitry Andric
33*700637cbSDimitry Andric static cl::opt<bool>
34*700637cbSDimitry Andric GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
35*700637cbSDimitry Andric cl::init(false), cl::Hidden,
36*700637cbSDimitry Andric cl::desc("Cull hazards on memory waits"));
37*700637cbSDimitry Andric
38*700637cbSDimitry Andric static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
39*700637cbSDimitry Andric "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
40*700637cbSDimitry Andric cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
41*700637cbSDimitry Andric "wait"));
42*700637cbSDimitry Andric
43*700637cbSDimitry Andric namespace {
44*700637cbSDimitry Andric
45*700637cbSDimitry Andric class AMDGPUWaitSGPRHazards {
46*700637cbSDimitry Andric public:
47*700637cbSDimitry Andric const SIInstrInfo *TII;
48*700637cbSDimitry Andric const SIRegisterInfo *TRI;
49*700637cbSDimitry Andric const MachineRegisterInfo *MRI;
50*700637cbSDimitry Andric unsigned DsNopCount;
51*700637cbSDimitry Andric
52*700637cbSDimitry Andric bool EnableSGPRHazardWaits;
53*700637cbSDimitry Andric bool CullSGPRHazardsOnFunctionBoundary;
54*700637cbSDimitry Andric bool CullSGPRHazardsAtMemWait;
55*700637cbSDimitry Andric unsigned CullSGPRHazardsMemWaitThreshold;
56*700637cbSDimitry Andric
AMDGPUWaitSGPRHazards()57*700637cbSDimitry Andric AMDGPUWaitSGPRHazards() {}
58*700637cbSDimitry Andric
59*700637cbSDimitry Andric // Return the numeric ID 0-127 for a given SGPR.
sgprNumber(Register Reg,const SIRegisterInfo & TRI)60*700637cbSDimitry Andric static std::optional<unsigned> sgprNumber(Register Reg,
61*700637cbSDimitry Andric const SIRegisterInfo &TRI) {
62*700637cbSDimitry Andric switch (Reg) {
63*700637cbSDimitry Andric case AMDGPU::M0:
64*700637cbSDimitry Andric case AMDGPU::EXEC:
65*700637cbSDimitry Andric case AMDGPU::EXEC_LO:
66*700637cbSDimitry Andric case AMDGPU::EXEC_HI:
67*700637cbSDimitry Andric case AMDGPU::SGPR_NULL:
68*700637cbSDimitry Andric case AMDGPU::SGPR_NULL64:
69*700637cbSDimitry Andric return {};
70*700637cbSDimitry Andric default:
71*700637cbSDimitry Andric break;
72*700637cbSDimitry Andric }
73*700637cbSDimitry Andric unsigned RegN = TRI.getHWRegIndex(Reg);
74*700637cbSDimitry Andric if (RegN > 127)
75*700637cbSDimitry Andric return {};
76*700637cbSDimitry Andric return RegN;
77*700637cbSDimitry Andric }
78*700637cbSDimitry Andric
isVCC(Register Reg)79*700637cbSDimitry Andric static inline bool isVCC(Register Reg) {
80*700637cbSDimitry Andric return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
81*700637cbSDimitry Andric }
82*700637cbSDimitry Andric
83*700637cbSDimitry Andric // Adjust global offsets for instructions bundled with S_GETPC_B64 after
84*700637cbSDimitry Andric // insertion of a new instruction.
updateGetPCBundle(MachineInstr * NewMI)85*700637cbSDimitry Andric static void updateGetPCBundle(MachineInstr *NewMI) {
86*700637cbSDimitry Andric if (!NewMI->isBundled())
87*700637cbSDimitry Andric return;
88*700637cbSDimitry Andric
89*700637cbSDimitry Andric // Find start of bundle.
90*700637cbSDimitry Andric auto I = NewMI->getIterator();
91*700637cbSDimitry Andric while (I->isBundledWithPred())
92*700637cbSDimitry Andric I--;
93*700637cbSDimitry Andric if (I->isBundle())
94*700637cbSDimitry Andric I++;
95*700637cbSDimitry Andric
96*700637cbSDimitry Andric // Bail if this is not an S_GETPC bundle.
97*700637cbSDimitry Andric if (I->getOpcode() != AMDGPU::S_GETPC_B64)
98*700637cbSDimitry Andric return;
99*700637cbSDimitry Andric
100*700637cbSDimitry Andric // Update offsets of any references in the bundle.
101*700637cbSDimitry Andric const unsigned NewBytes = 4;
102*700637cbSDimitry Andric assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
103*700637cbSDimitry Andric "Unexpected instruction insertion in bundle");
104*700637cbSDimitry Andric auto NextMI = std::next(NewMI->getIterator());
105*700637cbSDimitry Andric auto End = NewMI->getParent()->end();
106*700637cbSDimitry Andric while (NextMI != End && NextMI->isBundledWithPred()) {
107*700637cbSDimitry Andric for (auto &Operand : NextMI->operands()) {
108*700637cbSDimitry Andric if (Operand.isGlobal())
109*700637cbSDimitry Andric Operand.setOffset(Operand.getOffset() + NewBytes);
110*700637cbSDimitry Andric }
111*700637cbSDimitry Andric NextMI++;
112*700637cbSDimitry Andric }
113*700637cbSDimitry Andric }
114*700637cbSDimitry Andric
115*700637cbSDimitry Andric struct HazardState {
116*700637cbSDimitry Andric static constexpr unsigned None = 0;
117*700637cbSDimitry Andric static constexpr unsigned SALU = (1 << 0);
118*700637cbSDimitry Andric static constexpr unsigned VALU = (1 << 1);
119*700637cbSDimitry Andric
120*700637cbSDimitry Andric std::bitset<64> Tracked; // SGPR banks ever read by VALU
121*700637cbSDimitry Andric std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
122*700637cbSDimitry Andric std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
123*700637cbSDimitry Andric unsigned VCCHazard = None; // Source of current VCC writes
124*700637cbSDimitry Andric bool ActiveFlat = false; // Has unwaited flat instructions
125*700637cbSDimitry Andric
merge__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState126*700637cbSDimitry Andric bool merge(const HazardState &RHS) {
127*700637cbSDimitry Andric HazardState Orig(*this);
128*700637cbSDimitry Andric *this |= RHS;
129*700637cbSDimitry Andric return (*this != Orig);
130*700637cbSDimitry Andric }
131*700637cbSDimitry Andric
operator ==__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState132*700637cbSDimitry Andric bool operator==(const HazardState &RHS) const {
133*700637cbSDimitry Andric return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
134*700637cbSDimitry Andric VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
135*700637cbSDimitry Andric ActiveFlat == RHS.ActiveFlat;
136*700637cbSDimitry Andric }
137*700637cbSDimitry Andric
operator !=__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState138*700637cbSDimitry Andric bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }
139*700637cbSDimitry Andric
operator |=__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState140*700637cbSDimitry Andric void operator|=(const HazardState &RHS) {
141*700637cbSDimitry Andric Tracked |= RHS.Tracked;
142*700637cbSDimitry Andric SALUHazards |= RHS.SALUHazards;
143*700637cbSDimitry Andric VALUHazards |= RHS.VALUHazards;
144*700637cbSDimitry Andric VCCHazard |= RHS.VCCHazard;
145*700637cbSDimitry Andric ActiveFlat |= RHS.ActiveFlat;
146*700637cbSDimitry Andric }
147*700637cbSDimitry Andric };
148*700637cbSDimitry Andric
149*700637cbSDimitry Andric struct BlockHazardState {
150*700637cbSDimitry Andric HazardState In;
151*700637cbSDimitry Andric HazardState Out;
152*700637cbSDimitry Andric };
153*700637cbSDimitry Andric
154*700637cbSDimitry Andric DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;
155*700637cbSDimitry Andric
156*700637cbSDimitry Andric static constexpr unsigned WAVE32_NOPS = 4;
157*700637cbSDimitry Andric static constexpr unsigned WAVE64_NOPS = 8;
158*700637cbSDimitry Andric
insertHazardCull(MachineBasicBlock & MBB,MachineBasicBlock::instr_iterator & MI)159*700637cbSDimitry Andric void insertHazardCull(MachineBasicBlock &MBB,
160*700637cbSDimitry Andric MachineBasicBlock::instr_iterator &MI) {
161*700637cbSDimitry Andric assert(!MI->isBundled());
162*700637cbSDimitry Andric unsigned Count = DsNopCount;
163*700637cbSDimitry Andric while (Count--)
164*700637cbSDimitry Andric BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
165*700637cbSDimitry Andric }
166*700637cbSDimitry Andric
mergeMasks(unsigned Mask1,unsigned Mask2)167*700637cbSDimitry Andric unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
168*700637cbSDimitry Andric unsigned Mask = 0xffff;
169*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
170*700637cbSDimitry Andric Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
171*700637cbSDimitry Andric AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
172*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
173*700637cbSDimitry Andric Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
174*700637cbSDimitry Andric AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
175*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
176*700637cbSDimitry Andric Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
177*700637cbSDimitry Andric AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
178*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
179*700637cbSDimitry Andric Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
180*700637cbSDimitry Andric AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
181*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
182*700637cbSDimitry Andric Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
183*700637cbSDimitry Andric AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
184*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
185*700637cbSDimitry Andric Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
186*700637cbSDimitry Andric AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
187*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
188*700637cbSDimitry Andric Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
189*700637cbSDimitry Andric AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
190*700637cbSDimitry Andric return Mask;
191*700637cbSDimitry Andric }
192*700637cbSDimitry Andric
mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator & MI,unsigned Mask)193*700637cbSDimitry Andric bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
194*700637cbSDimitry Andric unsigned Mask) {
195*700637cbSDimitry Andric auto MBB = MI->getParent();
196*700637cbSDimitry Andric if (MI == MBB->instr_begin())
197*700637cbSDimitry Andric return false;
198*700637cbSDimitry Andric
199*700637cbSDimitry Andric auto It = prev_nodbg(MI, MBB->instr_begin());
200*700637cbSDimitry Andric if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
201*700637cbSDimitry Andric return false;
202*700637cbSDimitry Andric
203*700637cbSDimitry Andric It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
204*700637cbSDimitry Andric return true;
205*700637cbSDimitry Andric }
206*700637cbSDimitry Andric
runOnMachineBasicBlock(MachineBasicBlock & MBB,bool Emit)207*700637cbSDimitry Andric bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
208*700637cbSDimitry Andric enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };
209*700637cbSDimitry Andric
210*700637cbSDimitry Andric HazardState State = BlockState[&MBB].In;
211*700637cbSDimitry Andric SmallSet<Register, 8> SeenRegs;
212*700637cbSDimitry Andric bool Emitted = false;
213*700637cbSDimitry Andric unsigned DsNops = 0;
214*700637cbSDimitry Andric
215*700637cbSDimitry Andric for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
216*700637cbSDimitry Andric E = MBB.instr_end();
217*700637cbSDimitry Andric MI != E; ++MI) {
218*700637cbSDimitry Andric if (MI->isMetaInstruction())
219*700637cbSDimitry Andric continue;
220*700637cbSDimitry Andric
221*700637cbSDimitry Andric // Clear tracked SGPRs if sufficient DS_NOPs occur
222*700637cbSDimitry Andric if (MI->getOpcode() == AMDGPU::DS_NOP) {
223*700637cbSDimitry Andric if (++DsNops >= DsNopCount)
224*700637cbSDimitry Andric State.Tracked.reset();
225*700637cbSDimitry Andric continue;
226*700637cbSDimitry Andric }
227*700637cbSDimitry Andric DsNops = 0;
228*700637cbSDimitry Andric
229*700637cbSDimitry Andric // Snoop FLAT instructions to avoid adding culls before scratch/lds loads.
230*700637cbSDimitry Andric // Culls could be disproportionate in cost to load time.
231*700637cbSDimitry Andric if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
232*700637cbSDimitry Andric State.ActiveFlat = true;
233*700637cbSDimitry Andric
234*700637cbSDimitry Andric // SMEM or VMEM clears hazards
235*700637cbSDimitry Andric // FIXME: adapt to add FLAT without VALU (so !isLDSDMA())?
236*700637cbSDimitry Andric if ((SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI)) ||
237*700637cbSDimitry Andric SIInstrInfo::isSMRD(*MI)) {
238*700637cbSDimitry Andric State.VCCHazard = HazardState::None;
239*700637cbSDimitry Andric State.SALUHazards.reset();
240*700637cbSDimitry Andric State.VALUHazards.reset();
241*700637cbSDimitry Andric continue;
242*700637cbSDimitry Andric }
243*700637cbSDimitry Andric
244*700637cbSDimitry Andric // Existing S_WAITALU can clear hazards
245*700637cbSDimitry Andric if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
246*700637cbSDimitry Andric unsigned int Mask = MI->getOperand(0).getImm();
247*700637cbSDimitry Andric if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
248*700637cbSDimitry Andric State.VCCHazard &= ~HazardState::VALU;
249*700637cbSDimitry Andric if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
250*700637cbSDimitry Andric State.SALUHazards.reset();
251*700637cbSDimitry Andric State.VCCHazard &= ~HazardState::SALU;
252*700637cbSDimitry Andric }
253*700637cbSDimitry Andric if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
254*700637cbSDimitry Andric State.VALUHazards.reset();
255*700637cbSDimitry Andric continue;
256*700637cbSDimitry Andric }
257*700637cbSDimitry Andric
258*700637cbSDimitry Andric // Snoop counter waits to insert culls
259*700637cbSDimitry Andric if (CullSGPRHazardsAtMemWait &&
260*700637cbSDimitry Andric (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
261*700637cbSDimitry Andric MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
262*700637cbSDimitry Andric MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
263*700637cbSDimitry Andric (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
264*700637cbSDimitry Andric (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
265*700637cbSDimitry Andric if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
266*700637cbSDimitry Andric State.ActiveFlat = false;
267*700637cbSDimitry Andric } else {
268*700637cbSDimitry Andric State.Tracked.reset();
269*700637cbSDimitry Andric if (Emit)
270*700637cbSDimitry Andric insertHazardCull(MBB, MI);
271*700637cbSDimitry Andric continue;
272*700637cbSDimitry Andric }
273*700637cbSDimitry Andric }
274*700637cbSDimitry Andric
275*700637cbSDimitry Andric // Process only VALUs and SALUs
276*700637cbSDimitry Andric bool IsVALU = SIInstrInfo::isVALU(*MI);
277*700637cbSDimitry Andric bool IsSALU = SIInstrInfo::isSALU(*MI);
278*700637cbSDimitry Andric if (!IsVALU && !IsSALU)
279*700637cbSDimitry Andric continue;
280*700637cbSDimitry Andric
281*700637cbSDimitry Andric unsigned Wait = 0;
282*700637cbSDimitry Andric
283*700637cbSDimitry Andric auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
284*700637cbSDimitry Andric if (!Op.isReg())
285*700637cbSDimitry Andric return;
286*700637cbSDimitry Andric Register Reg = Op.getReg();
287*700637cbSDimitry Andric assert(!Op.getSubReg());
288*700637cbSDimitry Andric if (!TRI->isSGPRReg(*MRI, Reg))
289*700637cbSDimitry Andric return;
290*700637cbSDimitry Andric
291*700637cbSDimitry Andric // Only visit each register once
292*700637cbSDimitry Andric if (!SeenRegs.insert(Reg).second)
293*700637cbSDimitry Andric return;
294*700637cbSDimitry Andric
295*700637cbSDimitry Andric auto RegNumber = sgprNumber(Reg, *TRI);
296*700637cbSDimitry Andric if (!RegNumber)
297*700637cbSDimitry Andric return;
298*700637cbSDimitry Andric
299*700637cbSDimitry Andric // Track SGPRs by pair -- numeric ID of an 64b SGPR pair.
300*700637cbSDimitry Andric // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc
301*700637cbSDimitry Andric unsigned RegN = *RegNumber;
302*700637cbSDimitry Andric unsigned PairN = (RegN >> 1) & 0x3f;
303*700637cbSDimitry Andric
304*700637cbSDimitry Andric // Read/write of untracked register is safe; but must record any new
305*700637cbSDimitry Andric // reads.
306*700637cbSDimitry Andric if (!State.Tracked[PairN]) {
307*700637cbSDimitry Andric if (IsVALU && IsUse)
308*700637cbSDimitry Andric State.Tracked.set(PairN);
309*700637cbSDimitry Andric return;
310*700637cbSDimitry Andric }
311*700637cbSDimitry Andric
312*700637cbSDimitry Andric uint8_t SGPRCount =
313*700637cbSDimitry Andric AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;
314*700637cbSDimitry Andric
315*700637cbSDimitry Andric if (IsUse) {
316*700637cbSDimitry Andric // SALU reading SGPR clears VALU hazards
317*700637cbSDimitry Andric if (IsSALU) {
318*700637cbSDimitry Andric if (isVCC(Reg)) {
319*700637cbSDimitry Andric if (State.VCCHazard & HazardState::VALU)
320*700637cbSDimitry Andric State.VCCHazard = HazardState::None;
321*700637cbSDimitry Andric } else {
322*700637cbSDimitry Andric State.VALUHazards.reset();
323*700637cbSDimitry Andric }
324*700637cbSDimitry Andric }
325*700637cbSDimitry Andric // Compute required waits
326*700637cbSDimitry Andric for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
327*700637cbSDimitry Andric Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
328*700637cbSDimitry Andric Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
329*700637cbSDimitry Andric }
330*700637cbSDimitry Andric if (isVCC(Reg) && State.VCCHazard) {
331*700637cbSDimitry Andric // Note: it's possible for both SALU and VALU to exist if VCC
332*700637cbSDimitry Andric // was updated differently by merged predecessors.
333*700637cbSDimitry Andric if (State.VCCHazard & HazardState::SALU)
334*700637cbSDimitry Andric Wait |= WA_SALU;
335*700637cbSDimitry Andric if (State.VCCHazard & HazardState::VALU)
336*700637cbSDimitry Andric Wait |= WA_VCC;
337*700637cbSDimitry Andric }
338*700637cbSDimitry Andric } else {
339*700637cbSDimitry Andric // Update hazards
340*700637cbSDimitry Andric if (isVCC(Reg)) {
341*700637cbSDimitry Andric State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
342*700637cbSDimitry Andric } else {
343*700637cbSDimitry Andric for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
344*700637cbSDimitry Andric if (IsSALU)
345*700637cbSDimitry Andric State.SALUHazards.set(RegN + RegIdx);
346*700637cbSDimitry Andric else
347*700637cbSDimitry Andric State.VALUHazards.set(RegN + RegIdx);
348*700637cbSDimitry Andric }
349*700637cbSDimitry Andric }
350*700637cbSDimitry Andric }
351*700637cbSDimitry Andric };
352*700637cbSDimitry Andric
353*700637cbSDimitry Andric const bool IsSetPC =
354*700637cbSDimitry Andric (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
355*700637cbSDimitry Andric MI->getOpcode() != AMDGPU::S_ENDPGM &&
356*700637cbSDimitry Andric MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;
357*700637cbSDimitry Andric
358*700637cbSDimitry Andric // Only consider implicit VCC specified by instruction descriptor.
359*700637cbSDimitry Andric const bool HasImplicitVCC =
360*700637cbSDimitry Andric llvm::any_of(MI->getDesc().implicit_uses(), isVCC) ||
361*700637cbSDimitry Andric llvm::any_of(MI->getDesc().implicit_defs(), isVCC);
362*700637cbSDimitry Andric
363*700637cbSDimitry Andric if (IsSetPC) {
364*700637cbSDimitry Andric // All SGPR writes before a call/return must be flushed as the
365*700637cbSDimitry Andric // callee/caller will not will not see the hazard chain.
366*700637cbSDimitry Andric if (State.VCCHazard & HazardState::VALU)
367*700637cbSDimitry Andric Wait |= WA_VCC;
368*700637cbSDimitry Andric if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
369*700637cbSDimitry Andric Wait |= WA_SALU;
370*700637cbSDimitry Andric if (State.VALUHazards.any())
371*700637cbSDimitry Andric Wait |= WA_VALU;
372*700637cbSDimitry Andric if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
373*700637cbSDimitry Andric State.Tracked.reset();
374*700637cbSDimitry Andric if (Emit)
375*700637cbSDimitry Andric insertHazardCull(MBB, MI);
376*700637cbSDimitry Andric }
377*700637cbSDimitry Andric } else {
378*700637cbSDimitry Andric // Process uses to determine required wait.
379*700637cbSDimitry Andric SeenRegs.clear();
380*700637cbSDimitry Andric for (const MachineOperand &Op : MI->all_uses()) {
381*700637cbSDimitry Andric if (Op.isImplicit() &&
382*700637cbSDimitry Andric (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
383*700637cbSDimitry Andric continue;
384*700637cbSDimitry Andric processOperand(Op, true);
385*700637cbSDimitry Andric }
386*700637cbSDimitry Andric }
387*700637cbSDimitry Andric
388*700637cbSDimitry Andric // Apply wait
389*700637cbSDimitry Andric if (Wait) {
390*700637cbSDimitry Andric unsigned Mask = 0xffff;
391*700637cbSDimitry Andric if (Wait & WA_VCC) {
392*700637cbSDimitry Andric State.VCCHazard &= ~HazardState::VALU;
393*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
394*700637cbSDimitry Andric }
395*700637cbSDimitry Andric if (Wait & WA_SALU) {
396*700637cbSDimitry Andric State.SALUHazards.reset();
397*700637cbSDimitry Andric State.VCCHazard &= ~HazardState::SALU;
398*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
399*700637cbSDimitry Andric }
400*700637cbSDimitry Andric if (Wait & WA_VALU) {
401*700637cbSDimitry Andric State.VALUHazards.reset();
402*700637cbSDimitry Andric Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
403*700637cbSDimitry Andric }
404*700637cbSDimitry Andric if (Emit) {
405*700637cbSDimitry Andric if (!mergeConsecutiveWaitAlus(MI, Mask)) {
406*700637cbSDimitry Andric auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
407*700637cbSDimitry Andric TII->get(AMDGPU::S_WAITCNT_DEPCTR))
408*700637cbSDimitry Andric .addImm(Mask);
409*700637cbSDimitry Andric updateGetPCBundle(NewMI);
410*700637cbSDimitry Andric }
411*700637cbSDimitry Andric Emitted = true;
412*700637cbSDimitry Andric }
413*700637cbSDimitry Andric }
414*700637cbSDimitry Andric
415*700637cbSDimitry Andric // On return from a call SGPR state is unknown, so all potential hazards.
416*700637cbSDimitry Andric if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
417*700637cbSDimitry Andric State.Tracked.set();
418*700637cbSDimitry Andric
419*700637cbSDimitry Andric // Update hazards based on defs.
420*700637cbSDimitry Andric SeenRegs.clear();
421*700637cbSDimitry Andric for (const MachineOperand &Op : MI->all_defs()) {
422*700637cbSDimitry Andric if (Op.isImplicit() &&
423*700637cbSDimitry Andric (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
424*700637cbSDimitry Andric continue;
425*700637cbSDimitry Andric processOperand(Op, false);
426*700637cbSDimitry Andric }
427*700637cbSDimitry Andric }
428*700637cbSDimitry Andric
429*700637cbSDimitry Andric BlockHazardState &BS = BlockState[&MBB];
430*700637cbSDimitry Andric bool Changed = State != BS.Out;
431*700637cbSDimitry Andric if (Emit) {
432*700637cbSDimitry Andric assert(!Changed && "Hazard state should not change on emit pass");
433*700637cbSDimitry Andric return Emitted;
434*700637cbSDimitry Andric }
435*700637cbSDimitry Andric if (Changed)
436*700637cbSDimitry Andric BS.Out = State;
437*700637cbSDimitry Andric return Changed;
438*700637cbSDimitry Andric }
439*700637cbSDimitry Andric
run(MachineFunction & MF)440*700637cbSDimitry Andric bool run(MachineFunction &MF) {
441*700637cbSDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
442*700637cbSDimitry Andric if (!ST.hasVALUReadSGPRHazard())
443*700637cbSDimitry Andric return false;
444*700637cbSDimitry Andric
445*700637cbSDimitry Andric // Parse settings
446*700637cbSDimitry Andric EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
447*700637cbSDimitry Andric CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
448*700637cbSDimitry Andric CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
449*700637cbSDimitry Andric CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;
450*700637cbSDimitry Andric
451*700637cbSDimitry Andric if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
452*700637cbSDimitry Andric EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
453*700637cbSDimitry Andric "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
454*700637cbSDimitry Andric if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
455*700637cbSDimitry Andric CullSGPRHazardsOnFunctionBoundary =
456*700637cbSDimitry Andric MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
457*700637cbSDimitry Andric if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
458*700637cbSDimitry Andric CullSGPRHazardsAtMemWait =
459*700637cbSDimitry Andric MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
460*700637cbSDimitry Andric if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
461*700637cbSDimitry Andric CullSGPRHazardsMemWaitThreshold =
462*700637cbSDimitry Andric MF.getFunction().getFnAttributeAsParsedInteger(
463*700637cbSDimitry Andric "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
464*700637cbSDimitry Andric CullSGPRHazardsMemWaitThreshold);
465*700637cbSDimitry Andric
466*700637cbSDimitry Andric // Bail if disabled
467*700637cbSDimitry Andric if (!EnableSGPRHazardWaits)
468*700637cbSDimitry Andric return false;
469*700637cbSDimitry Andric
470*700637cbSDimitry Andric TII = ST.getInstrInfo();
471*700637cbSDimitry Andric TRI = ST.getRegisterInfo();
472*700637cbSDimitry Andric MRI = &MF.getRegInfo();
473*700637cbSDimitry Andric DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
474*700637cbSDimitry Andric
475*700637cbSDimitry Andric auto CallingConv = MF.getFunction().getCallingConv();
476*700637cbSDimitry Andric if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
477*700637cbSDimitry Andric !CullSGPRHazardsOnFunctionBoundary) {
478*700637cbSDimitry Andric // Callee must consider all SGPRs as tracked.
479*700637cbSDimitry Andric LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
480*700637cbSDimitry Andric MachineBasicBlock &EntryBlock = MF.front();
481*700637cbSDimitry Andric BlockState[&EntryBlock].In.Tracked.set();
482*700637cbSDimitry Andric }
483*700637cbSDimitry Andric
484*700637cbSDimitry Andric // Calculate the hazard state for each basic block.
485*700637cbSDimitry Andric // Iterate until a fixed point is reached.
486*700637cbSDimitry Andric // Fixed point is guaranteed as merge function only ever increases
487*700637cbSDimitry Andric // the hazard set, and all backedges will cause a merge.
488*700637cbSDimitry Andric //
489*700637cbSDimitry Andric // Note: we have to take care of the entry block as this technically
490*700637cbSDimitry Andric // has an edge from outside the function. Failure to treat this as
491*700637cbSDimitry Andric // a merge could prevent fixed point being reached.
492*700637cbSDimitry Andric SetVector<MachineBasicBlock *> Worklist;
493*700637cbSDimitry Andric for (auto &MBB : reverse(MF))
494*700637cbSDimitry Andric Worklist.insert(&MBB);
495*700637cbSDimitry Andric while (!Worklist.empty()) {
496*700637cbSDimitry Andric auto &MBB = *Worklist.pop_back_val();
497*700637cbSDimitry Andric bool Changed = runOnMachineBasicBlock(MBB, false);
498*700637cbSDimitry Andric if (Changed) {
499*700637cbSDimitry Andric // Note: take a copy of state here in case it is reallocated by map
500*700637cbSDimitry Andric HazardState NewState = BlockState[&MBB].Out;
501*700637cbSDimitry Andric // Propagate to all successor blocks
502*700637cbSDimitry Andric for (auto Succ : MBB.successors()) {
503*700637cbSDimitry Andric // We only need to merge hazards at CFG merge points.
504*700637cbSDimitry Andric auto &SuccState = BlockState[Succ];
505*700637cbSDimitry Andric if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
506*700637cbSDimitry Andric if (SuccState.In != NewState) {
507*700637cbSDimitry Andric SuccState.In = NewState;
508*700637cbSDimitry Andric Worklist.insert(Succ);
509*700637cbSDimitry Andric }
510*700637cbSDimitry Andric } else if (SuccState.In.merge(NewState)) {
511*700637cbSDimitry Andric Worklist.insert(Succ);
512*700637cbSDimitry Andric }
513*700637cbSDimitry Andric }
514*700637cbSDimitry Andric }
515*700637cbSDimitry Andric }
516*700637cbSDimitry Andric
517*700637cbSDimitry Andric LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");
518*700637cbSDimitry Andric
519*700637cbSDimitry Andric // Final to emit wait instructions.
520*700637cbSDimitry Andric bool Changed = false;
521*700637cbSDimitry Andric for (auto &MBB : MF)
522*700637cbSDimitry Andric Changed |= runOnMachineBasicBlock(MBB, true);
523*700637cbSDimitry Andric
524*700637cbSDimitry Andric BlockState.clear();
525*700637cbSDimitry Andric return Changed;
526*700637cbSDimitry Andric }
527*700637cbSDimitry Andric };
528*700637cbSDimitry Andric
529*700637cbSDimitry Andric class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
530*700637cbSDimitry Andric public:
531*700637cbSDimitry Andric static char ID;
532*700637cbSDimitry Andric
AMDGPUWaitSGPRHazardsLegacy()533*700637cbSDimitry Andric AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}
534*700637cbSDimitry Andric
runOnMachineFunction(MachineFunction & MF)535*700637cbSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override {
536*700637cbSDimitry Andric return AMDGPUWaitSGPRHazards().run(MF);
537*700637cbSDimitry Andric }
538*700637cbSDimitry Andric
getAnalysisUsage(AnalysisUsage & AU) const539*700637cbSDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override {
540*700637cbSDimitry Andric AU.setPreservesCFG();
541*700637cbSDimitry Andric MachineFunctionPass::getAnalysisUsage(AU);
542*700637cbSDimitry Andric }
543*700637cbSDimitry Andric };
544*700637cbSDimitry Andric
545*700637cbSDimitry Andric } // namespace
546*700637cbSDimitry Andric
547*700637cbSDimitry Andric char AMDGPUWaitSGPRHazardsLegacy::ID = 0;
548*700637cbSDimitry Andric
549*700637cbSDimitry Andric char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;
550*700637cbSDimitry Andric
551*700637cbSDimitry Andric INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
552*700637cbSDimitry Andric "AMDGPU Insert waits for SGPR read hazards", false, false)
553*700637cbSDimitry Andric
554*700637cbSDimitry Andric PreservedAnalyses
run(MachineFunction & MF,MachineFunctionAnalysisManager & MFAM)555*700637cbSDimitry Andric AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
556*700637cbSDimitry Andric MachineFunctionAnalysisManager &MFAM) {
557*700637cbSDimitry Andric if (AMDGPUWaitSGPRHazards().run(MF))
558*700637cbSDimitry Andric return PreservedAnalyses::none();
559*700637cbSDimitry Andric return PreservedAnalyses::all();
560*700637cbSDimitry Andric }
561