xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1*700637cbSDimitry Andric //===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
2*700637cbSDimitry Andric //
3*700637cbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*700637cbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*700637cbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*700637cbSDimitry Andric //
7*700637cbSDimitry Andric //===----------------------------------------------------------------------===//
8*700637cbSDimitry Andric //
9*700637cbSDimitry Andric /// \file
10*700637cbSDimitry Andric /// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
11*700637cbSDimitry Andric //
12*700637cbSDimitry Andric //===----------------------------------------------------------------------===//
13*700637cbSDimitry Andric 
14*700637cbSDimitry Andric #include "AMDGPUWaitSGPRHazards.h"
15*700637cbSDimitry Andric #include "AMDGPU.h"
16*700637cbSDimitry Andric #include "GCNSubtarget.h"
17*700637cbSDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18*700637cbSDimitry Andric #include "SIInstrInfo.h"
19*700637cbSDimitry Andric #include "llvm/ADT/SetVector.h"
20*700637cbSDimitry Andric 
21*700637cbSDimitry Andric using namespace llvm;
22*700637cbSDimitry Andric 
23*700637cbSDimitry Andric #define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
24*700637cbSDimitry Andric 
// Command-line controls for this pass. Each flag has a function attribute of
// the same name that run() consults when the flag is not explicitly set on
// the command line, allowing per-function overrides.
static cl::opt<bool> GlobalEnableSGPRHazardWaits(
    "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
    cl::desc("Enable required s_wait_alu on SGPR hazards"));

static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
    "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
    cl::desc("Cull hazards on function boundaries"));

static cl::opt<bool>
    GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
                                   cl::init(false), cl::Hidden,
                                   cl::desc("Cull hazards on memory waits"));

static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
    "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
    cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
             "wait"));
43*700637cbSDimitry Andric namespace {
44*700637cbSDimitry Andric 
class AMDGPUWaitSGPRHazards {
public:
  // Cached target/function info; initialized in run().
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  // Number of DS_NOPs that clears SGPR hazard tracking; set from the wave
  // size in run() (WAVE32_NOPS or WAVE64_NOPS).
  unsigned DsNopCount;

  // Effective settings: command-line flags, possibly overridden by function
  // attributes of the same name (resolved in run()).
  bool EnableSGPRHazardWaits;
  bool CullSGPRHazardsOnFunctionBoundary;
  bool CullSGPRHazardsAtMemWait;
  unsigned CullSGPRHazardsMemWaitThreshold;
56*700637cbSDimitry Andric 
AMDGPUWaitSGPRHazards()57*700637cbSDimitry Andric   AMDGPUWaitSGPRHazards() {}
58*700637cbSDimitry Andric 
59*700637cbSDimitry Andric   // Return the numeric ID 0-127 for a given SGPR.
sgprNumber(Register Reg,const SIRegisterInfo & TRI)60*700637cbSDimitry Andric   static std::optional<unsigned> sgprNumber(Register Reg,
61*700637cbSDimitry Andric                                             const SIRegisterInfo &TRI) {
62*700637cbSDimitry Andric     switch (Reg) {
63*700637cbSDimitry Andric     case AMDGPU::M0:
64*700637cbSDimitry Andric     case AMDGPU::EXEC:
65*700637cbSDimitry Andric     case AMDGPU::EXEC_LO:
66*700637cbSDimitry Andric     case AMDGPU::EXEC_HI:
67*700637cbSDimitry Andric     case AMDGPU::SGPR_NULL:
68*700637cbSDimitry Andric     case AMDGPU::SGPR_NULL64:
69*700637cbSDimitry Andric       return {};
70*700637cbSDimitry Andric     default:
71*700637cbSDimitry Andric       break;
72*700637cbSDimitry Andric     }
73*700637cbSDimitry Andric     unsigned RegN = TRI.getHWRegIndex(Reg);
74*700637cbSDimitry Andric     if (RegN > 127)
75*700637cbSDimitry Andric       return {};
76*700637cbSDimitry Andric     return RegN;
77*700637cbSDimitry Andric   }
78*700637cbSDimitry Andric 
isVCC(Register Reg)79*700637cbSDimitry Andric   static inline bool isVCC(Register Reg) {
80*700637cbSDimitry Andric     return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
81*700637cbSDimitry Andric   }
82*700637cbSDimitry Andric 
83*700637cbSDimitry Andric   // Adjust global offsets for instructions bundled with S_GETPC_B64 after
84*700637cbSDimitry Andric   // insertion of a new instruction.
updateGetPCBundle(MachineInstr * NewMI)85*700637cbSDimitry Andric   static void updateGetPCBundle(MachineInstr *NewMI) {
86*700637cbSDimitry Andric     if (!NewMI->isBundled())
87*700637cbSDimitry Andric       return;
88*700637cbSDimitry Andric 
89*700637cbSDimitry Andric     // Find start of bundle.
90*700637cbSDimitry Andric     auto I = NewMI->getIterator();
91*700637cbSDimitry Andric     while (I->isBundledWithPred())
92*700637cbSDimitry Andric       I--;
93*700637cbSDimitry Andric     if (I->isBundle())
94*700637cbSDimitry Andric       I++;
95*700637cbSDimitry Andric 
96*700637cbSDimitry Andric     // Bail if this is not an S_GETPC bundle.
97*700637cbSDimitry Andric     if (I->getOpcode() != AMDGPU::S_GETPC_B64)
98*700637cbSDimitry Andric       return;
99*700637cbSDimitry Andric 
100*700637cbSDimitry Andric     // Update offsets of any references in the bundle.
101*700637cbSDimitry Andric     const unsigned NewBytes = 4;
102*700637cbSDimitry Andric     assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
103*700637cbSDimitry Andric            "Unexpected instruction insertion in bundle");
104*700637cbSDimitry Andric     auto NextMI = std::next(NewMI->getIterator());
105*700637cbSDimitry Andric     auto End = NewMI->getParent()->end();
106*700637cbSDimitry Andric     while (NextMI != End && NextMI->isBundledWithPred()) {
107*700637cbSDimitry Andric       for (auto &Operand : NextMI->operands()) {
108*700637cbSDimitry Andric         if (Operand.isGlobal())
109*700637cbSDimitry Andric           Operand.setOffset(Operand.getOffset() + NewBytes);
110*700637cbSDimitry Andric       }
111*700637cbSDimitry Andric       NextMI++;
112*700637cbSDimitry Andric     }
113*700637cbSDimitry Andric   }
114*700637cbSDimitry Andric 
115*700637cbSDimitry Andric   struct HazardState {
116*700637cbSDimitry Andric     static constexpr unsigned None = 0;
117*700637cbSDimitry Andric     static constexpr unsigned SALU = (1 << 0);
118*700637cbSDimitry Andric     static constexpr unsigned VALU = (1 << 1);
119*700637cbSDimitry Andric 
120*700637cbSDimitry Andric     std::bitset<64> Tracked;      // SGPR banks ever read by VALU
121*700637cbSDimitry Andric     std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
122*700637cbSDimitry Andric     std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
123*700637cbSDimitry Andric     unsigned VCCHazard = None;    // Source of current VCC writes
124*700637cbSDimitry Andric     bool ActiveFlat = false;      // Has unwaited flat instructions
125*700637cbSDimitry Andric 
merge__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState126*700637cbSDimitry Andric     bool merge(const HazardState &RHS) {
127*700637cbSDimitry Andric       HazardState Orig(*this);
128*700637cbSDimitry Andric       *this |= RHS;
129*700637cbSDimitry Andric       return (*this != Orig);
130*700637cbSDimitry Andric     }
131*700637cbSDimitry Andric 
operator ==__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState132*700637cbSDimitry Andric     bool operator==(const HazardState &RHS) const {
133*700637cbSDimitry Andric       return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
134*700637cbSDimitry Andric              VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
135*700637cbSDimitry Andric              ActiveFlat == RHS.ActiveFlat;
136*700637cbSDimitry Andric     }
137*700637cbSDimitry Andric 
operator !=__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState138*700637cbSDimitry Andric     bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }
139*700637cbSDimitry Andric 
operator |=__anonce47010f0111::AMDGPUWaitSGPRHazards::HazardState140*700637cbSDimitry Andric     void operator|=(const HazardState &RHS) {
141*700637cbSDimitry Andric       Tracked |= RHS.Tracked;
142*700637cbSDimitry Andric       SALUHazards |= RHS.SALUHazards;
143*700637cbSDimitry Andric       VALUHazards |= RHS.VALUHazards;
144*700637cbSDimitry Andric       VCCHazard |= RHS.VCCHazard;
145*700637cbSDimitry Andric       ActiveFlat |= RHS.ActiveFlat;
146*700637cbSDimitry Andric     }
147*700637cbSDimitry Andric   };
148*700637cbSDimitry Andric 
  // Hazard state at entry and exit of a basic block, computed by the
  // iterative dataflow in run() and consumed by the emit pass.
  struct BlockHazardState {
    HazardState In;
    HazardState Out;
  };

  DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;

  // Number of DS_NOPs required to clear SGPR hazard tracking, per wave size
  // (see the DS_NOP handling in runOnMachineBasicBlock).
  static constexpr unsigned WAVE32_NOPS = 4;
  static constexpr unsigned WAVE64_NOPS = 8;
158*700637cbSDimitry Andric 
insertHazardCull(MachineBasicBlock & MBB,MachineBasicBlock::instr_iterator & MI)159*700637cbSDimitry Andric   void insertHazardCull(MachineBasicBlock &MBB,
160*700637cbSDimitry Andric                         MachineBasicBlock::instr_iterator &MI) {
161*700637cbSDimitry Andric     assert(!MI->isBundled());
162*700637cbSDimitry Andric     unsigned Count = DsNopCount;
163*700637cbSDimitry Andric     while (Count--)
164*700637cbSDimitry Andric       BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
165*700637cbSDimitry Andric   }
166*700637cbSDimitry Andric 
mergeMasks(unsigned Mask1,unsigned Mask2)167*700637cbSDimitry Andric   unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
168*700637cbSDimitry Andric     unsigned Mask = 0xffff;
169*700637cbSDimitry Andric     Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
170*700637cbSDimitry Andric         Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
171*700637cbSDimitry Andric                        AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
172*700637cbSDimitry Andric     Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
173*700637cbSDimitry Andric         Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
174*700637cbSDimitry Andric                        AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
175*700637cbSDimitry Andric     Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
176*700637cbSDimitry Andric         Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
177*700637cbSDimitry Andric                        AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
178*700637cbSDimitry Andric     Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
179*700637cbSDimitry Andric         Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
180*700637cbSDimitry Andric                        AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
181*700637cbSDimitry Andric     Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
182*700637cbSDimitry Andric         Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
183*700637cbSDimitry Andric                        AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
184*700637cbSDimitry Andric     Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
185*700637cbSDimitry Andric         Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
186*700637cbSDimitry Andric                        AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
187*700637cbSDimitry Andric     Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
188*700637cbSDimitry Andric         Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
189*700637cbSDimitry Andric                        AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
190*700637cbSDimitry Andric     return Mask;
191*700637cbSDimitry Andric   }
192*700637cbSDimitry Andric 
mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator & MI,unsigned Mask)193*700637cbSDimitry Andric   bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
194*700637cbSDimitry Andric                                 unsigned Mask) {
195*700637cbSDimitry Andric     auto MBB = MI->getParent();
196*700637cbSDimitry Andric     if (MI == MBB->instr_begin())
197*700637cbSDimitry Andric       return false;
198*700637cbSDimitry Andric 
199*700637cbSDimitry Andric     auto It = prev_nodbg(MI, MBB->instr_begin());
200*700637cbSDimitry Andric     if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
201*700637cbSDimitry Andric       return false;
202*700637cbSDimitry Andric 
203*700637cbSDimitry Andric     It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
204*700637cbSDimitry Andric     return true;
205*700637cbSDimitry Andric   }
206*700637cbSDimitry Andric 
  // Scan one basic block, updating the SGPR hazard state instruction by
  // instruction. With Emit=false this is the analysis phase: only the
  // block's Out state is recomputed, and the return value reports whether it
  // changed. With Emit=true the required s_wait_alu / DS_NOP instructions
  // are actually inserted (state must already be at a fixed point), and the
  // return value reports whether anything was emitted.
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    // Which waits the current instruction requires before it may execute.
    enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };

    HazardState State = BlockState[&MBB].In;
    SmallSet<Register, 8> SeenRegs;
    bool Emitted = false;
    unsigned DsNops = 0; // consecutive DS_NOPs seen so far

    for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
                                           E = MBB.instr_end();
         MI != E; ++MI) {
      if (MI->isMetaInstruction())
        continue;

      // Clear tracked SGPRs if sufficient DS_NOPs occur
      if (MI->getOpcode() == AMDGPU::DS_NOP) {
        if (++DsNops >= DsNopCount)
          State.Tracked.reset();
        continue;
      }
      DsNops = 0;

      // Snoop FLAT instructions to avoid adding culls before scratch/lds loads.
      // Culls could be disproportionate in cost to load time.
      if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
        State.ActiveFlat = true;

      // SMEM or VMEM clears hazards
      // FIXME: adapt to add FLAT without VALU (so !isLDSDMA())?
      if ((SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI)) ||
          SIInstrInfo::isSMRD(*MI)) {
        State.VCCHazard = HazardState::None;
        State.SALUHazards.reset();
        State.VALUHazards.reset();
        continue;
      }

      // Existing S_WAITALU can clear hazards
      if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned int Mask = MI->getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
          State.VCCHazard &= ~HazardState::VALU;
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
        }
        if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
          State.VALUHazards.reset();
        continue;
      }

      // Snoop counter waits to insert culls: a zero-count memory wait is a
      // natural stall point, so culling there is comparatively cheap.
      if (CullSGPRHazardsAtMemWait &&
          (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
          (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
          (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
        if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
          // First LOADCNT wait may be covering a scratch/LDS FLAT access;
          // skip culling here (see ActiveFlat snooping above).
          State.ActiveFlat = false;
        } else {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
          continue;
        }
      }

      // Process only VALUs and SALUs
      bool IsVALU = SIInstrInfo::isVALU(*MI);
      bool IsSALU = SIInstrInfo::isSALU(*MI);
      if (!IsVALU && !IsSALU)
        continue;

      unsigned Wait = 0;

      // Examine a single register operand: on use, accumulate required
      // waits into Wait; on def, record new hazards in State.
      auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
        if (!Op.isReg())
          return;
        Register Reg = Op.getReg();
        assert(!Op.getSubReg());
        if (!TRI->isSGPRReg(*MRI, Reg))
          return;

        // Only visit each register once
        if (!SeenRegs.insert(Reg).second)
          return;

        auto RegNumber = sgprNumber(Reg, *TRI);
        if (!RegNumber)
          return;

        // Track SGPRs by pair -- numeric ID of an 64b SGPR pair.
        // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc
        unsigned RegN = *RegNumber;
        unsigned PairN = (RegN >> 1) & 0x3f;

        // Read/write of untracked register is safe; but must record any new
        // reads.
        if (!State.Tracked[PairN]) {
          if (IsVALU && IsUse)
            State.Tracked.set(PairN);
          return;
        }

        // Number of 32-bit SGPRs covered by this register operand.
        uint8_t SGPRCount =
            AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;

        if (IsUse) {
          // SALU reading SGPR clears VALU hazards
          if (IsSALU) {
            if (isVCC(Reg)) {
              if (State.VCCHazard & HazardState::VALU)
                State.VCCHazard = HazardState::None;
            } else {
              State.VALUHazards.reset();
            }
          }
          // Compute required waits
          for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
            Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
            Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
          }
          if (isVCC(Reg) && State.VCCHazard) {
            // Note: it's possible for both SALU and VALU to exist if VCC
            // was updated differently by merged predecessors.
            if (State.VCCHazard & HazardState::SALU)
              Wait |= WA_SALU;
            if (State.VCCHazard & HazardState::VALU)
              Wait |= WA_VCC;
          }
        } else {
          // Update hazards
          if (isVCC(Reg)) {
            State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
          } else {
            for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
              if (IsSALU)
                State.SALUHazards.set(RegN + RegIdx);
              else
                State.VALUHazards.set(RegN + RegIdx);
            }
          }
        }
      };

      // Instructions which transfer control out of this function's hazard
      // chain (calls, returns, indirect branches) -- but not program end.
      const bool IsSetPC =
          (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
          MI->getOpcode() != AMDGPU::S_ENDPGM &&
          MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;

      // Only consider implicit VCC specified by instruction descriptor.
      const bool HasImplicitVCC =
          llvm::any_of(MI->getDesc().implicit_uses(), isVCC) ||
          llvm::any_of(MI->getDesc().implicit_defs(), isVCC);

      if (IsSetPC) {
        // All SGPR writes before a call/return must be flushed as the
        // callee/caller will not see the hazard chain.
        if (State.VCCHazard & HazardState::VALU)
          Wait |= WA_VCC;
        if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
          Wait |= WA_SALU;
        if (State.VALUHazards.any())
          Wait |= WA_VALU;
        if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
        }
      } else {
        // Process uses to determine required wait.
        SeenRegs.clear();
        for (const MachineOperand &Op : MI->all_uses()) {
          if (Op.isImplicit() &&
              (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
            continue;
          processOperand(Op, true);
        }
      }

      // Apply wait: clear the satisfied hazards from State and build the
      // corresponding DEPCTR mask.
      if (Wait) {
        unsigned Mask = 0xffff;
        if (Wait & WA_VCC) {
          State.VCCHazard &= ~HazardState::VALU;
          Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
        }
        if (Wait & WA_SALU) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
          Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
        }
        if (Wait & WA_VALU) {
          State.VALUHazards.reset();
          Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
        }
        if (Emit) {
          // Prefer folding into an adjacent existing wait over adding one.
          if (!mergeConsecutiveWaitAlus(MI, Mask)) {
            auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
                                 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
                             .addImm(Mask);
            updateGetPCBundle(NewMI);
          }
          Emitted = true;
        }
      }

      // On return from a call SGPR state is unknown, so all potential hazards.
      if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
        State.Tracked.set();

      // Update hazards based on defs.
      SeenRegs.clear();
      for (const MachineOperand &Op : MI->all_defs()) {
        if (Op.isImplicit() &&
            (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
          continue;
        processOperand(Op, false);
      }
    }

    // Record the new Out state (analysis) or report emissions (emit pass).
    BlockHazardState &BS = BlockState[&MBB];
    bool Changed = State != BS.Out;
    if (Emit) {
      assert(!Changed && "Hazard state should not change on emit pass");
      return Emitted;
    }
    if (Changed)
      BS.Out = State;
    return Changed;
  }
439*700637cbSDimitry Andric 
run(MachineFunction & MF)440*700637cbSDimitry Andric   bool run(MachineFunction &MF) {
441*700637cbSDimitry Andric     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
442*700637cbSDimitry Andric     if (!ST.hasVALUReadSGPRHazard())
443*700637cbSDimitry Andric       return false;
444*700637cbSDimitry Andric 
445*700637cbSDimitry Andric     // Parse settings
446*700637cbSDimitry Andric     EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
447*700637cbSDimitry Andric     CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
448*700637cbSDimitry Andric     CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
449*700637cbSDimitry Andric     CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;
450*700637cbSDimitry Andric 
451*700637cbSDimitry Andric     if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
452*700637cbSDimitry Andric       EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
453*700637cbSDimitry Andric           "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
454*700637cbSDimitry Andric     if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
455*700637cbSDimitry Andric       CullSGPRHazardsOnFunctionBoundary =
456*700637cbSDimitry Andric           MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
457*700637cbSDimitry Andric     if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
458*700637cbSDimitry Andric       CullSGPRHazardsAtMemWait =
459*700637cbSDimitry Andric           MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
460*700637cbSDimitry Andric     if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
461*700637cbSDimitry Andric       CullSGPRHazardsMemWaitThreshold =
462*700637cbSDimitry Andric           MF.getFunction().getFnAttributeAsParsedInteger(
463*700637cbSDimitry Andric               "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
464*700637cbSDimitry Andric               CullSGPRHazardsMemWaitThreshold);
465*700637cbSDimitry Andric 
466*700637cbSDimitry Andric     // Bail if disabled
467*700637cbSDimitry Andric     if (!EnableSGPRHazardWaits)
468*700637cbSDimitry Andric       return false;
469*700637cbSDimitry Andric 
470*700637cbSDimitry Andric     TII = ST.getInstrInfo();
471*700637cbSDimitry Andric     TRI = ST.getRegisterInfo();
472*700637cbSDimitry Andric     MRI = &MF.getRegInfo();
473*700637cbSDimitry Andric     DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
474*700637cbSDimitry Andric 
475*700637cbSDimitry Andric     auto CallingConv = MF.getFunction().getCallingConv();
476*700637cbSDimitry Andric     if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
477*700637cbSDimitry Andric         !CullSGPRHazardsOnFunctionBoundary) {
478*700637cbSDimitry Andric       // Callee must consider all SGPRs as tracked.
479*700637cbSDimitry Andric       LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
480*700637cbSDimitry Andric       MachineBasicBlock &EntryBlock = MF.front();
481*700637cbSDimitry Andric       BlockState[&EntryBlock].In.Tracked.set();
482*700637cbSDimitry Andric     }
483*700637cbSDimitry Andric 
484*700637cbSDimitry Andric     // Calculate the hazard state for each basic block.
485*700637cbSDimitry Andric     // Iterate until a fixed point is reached.
486*700637cbSDimitry Andric     // Fixed point is guaranteed as merge function only ever increases
487*700637cbSDimitry Andric     // the hazard set, and all backedges will cause a merge.
488*700637cbSDimitry Andric     //
489*700637cbSDimitry Andric     // Note: we have to take care of the entry block as this technically
490*700637cbSDimitry Andric     // has an edge from outside the function. Failure to treat this as
491*700637cbSDimitry Andric     // a merge could prevent fixed point being reached.
492*700637cbSDimitry Andric     SetVector<MachineBasicBlock *> Worklist;
493*700637cbSDimitry Andric     for (auto &MBB : reverse(MF))
494*700637cbSDimitry Andric       Worklist.insert(&MBB);
495*700637cbSDimitry Andric     while (!Worklist.empty()) {
496*700637cbSDimitry Andric       auto &MBB = *Worklist.pop_back_val();
497*700637cbSDimitry Andric       bool Changed = runOnMachineBasicBlock(MBB, false);
498*700637cbSDimitry Andric       if (Changed) {
499*700637cbSDimitry Andric         // Note: take a copy of state here in case it is reallocated by map
500*700637cbSDimitry Andric         HazardState NewState = BlockState[&MBB].Out;
501*700637cbSDimitry Andric         // Propagate to all successor blocks
502*700637cbSDimitry Andric         for (auto Succ : MBB.successors()) {
503*700637cbSDimitry Andric           // We only need to merge hazards at CFG merge points.
504*700637cbSDimitry Andric           auto &SuccState = BlockState[Succ];
505*700637cbSDimitry Andric           if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
506*700637cbSDimitry Andric             if (SuccState.In != NewState) {
507*700637cbSDimitry Andric               SuccState.In = NewState;
508*700637cbSDimitry Andric               Worklist.insert(Succ);
509*700637cbSDimitry Andric             }
510*700637cbSDimitry Andric           } else if (SuccState.In.merge(NewState)) {
511*700637cbSDimitry Andric             Worklist.insert(Succ);
512*700637cbSDimitry Andric           }
513*700637cbSDimitry Andric         }
514*700637cbSDimitry Andric       }
515*700637cbSDimitry Andric     }
516*700637cbSDimitry Andric 
517*700637cbSDimitry Andric     LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");
518*700637cbSDimitry Andric 
519*700637cbSDimitry Andric     // Final pass: emit the wait instructions.
520*700637cbSDimitry Andric     bool Changed = false;
521*700637cbSDimitry Andric     for (auto &MBB : MF)
522*700637cbSDimitry Andric       Changed |= runOnMachineBasicBlock(MBB, true);
523*700637cbSDimitry Andric 
524*700637cbSDimitry Andric     BlockState.clear();
525*700637cbSDimitry Andric     return Changed;
526*700637cbSDimitry Andric   }
527*700637cbSDimitry Andric };
528*700637cbSDimitry Andric 
529*700637cbSDimitry Andric class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
530*700637cbSDimitry Andric public:
531*700637cbSDimitry Andric   static char ID;
532*700637cbSDimitry Andric 
AMDGPUWaitSGPRHazardsLegacy()533*700637cbSDimitry Andric   AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}
534*700637cbSDimitry Andric 
runOnMachineFunction(MachineFunction & MF)535*700637cbSDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override {
536*700637cbSDimitry Andric     return AMDGPUWaitSGPRHazards().run(MF);
537*700637cbSDimitry Andric   }
538*700637cbSDimitry Andric 
getAnalysisUsage(AnalysisUsage & AU) const539*700637cbSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
540*700637cbSDimitry Andric     AU.setPreservesCFG();
541*700637cbSDimitry Andric     MachineFunctionPass::getAnalysisUsage(AU);
542*700637cbSDimitry Andric   }
543*700637cbSDimitry Andric };
544*700637cbSDimitry Andric 
545*700637cbSDimitry Andric } // namespace
546*700637cbSDimitry Andric 
// Pass identification: the address of ID uniquely identifies the legacy pass.
char AMDGPUWaitSGPRHazardsLegacy::ID = 0;

// Externally visible handle so the target can reference this pass by identity.
char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;

// Register the legacy pass with the pass registry under DEBUG_TYPE.
INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
                "AMDGPU Insert waits for SGPR read hazards", false, false)
553*700637cbSDimitry Andric 
554*700637cbSDimitry Andric PreservedAnalyses
run(MachineFunction & MF,MachineFunctionAnalysisManager & MFAM)555*700637cbSDimitry Andric AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
556*700637cbSDimitry Andric                                MachineFunctionAnalysisManager &MFAM) {
557*700637cbSDimitry Andric   if (AMDGPUWaitSGPRHazards().run(MF))
558*700637cbSDimitry Andric     return PreservedAnalyses::none();
559*700637cbSDimitry Andric   return PreservedAnalyses::all();
560*700637cbSDimitry Andric }
561