//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

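// This hook runs once per MCInst, after the generic MCInst -> mca::Instruction
// lowering but before simulation begins, which makes it the right place to
// preserve target-specific operand information.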
void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering, so this hook
// copies every register and immediate operand back onto the mca::Instruction.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
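    // Operands that are neither a register nor an immediate are left as
    // default-constructed (invalid) MCAOperands, so that operand indices
    // still line up with the original MCInst.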
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

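// checkCustomHazard() is invoked before an instruction is dispatched. A
// nonzero return value stalls the instruction for (at least) that many cycles
// before the hazard is re-evaluated; returning 0 lets it dispatch normally.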
unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we wouldn't see
  // any pseudo instructions here. However, there are plans to eventually make
  // it possible to use mca within backend passes. As such, I have left the
  // pseudo versions of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  // The first five cases below are the pseudo variants of s_waitcnt.
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit s_waitcnt 0, but I'm
    // not sure if it would be appropriate to model this in llvm-mca, given
    // how the iterations work while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
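  // vmcnt counts outstanding VMEM operations, expcnt counts exports and GDS
  // accesses, lgkmcnt counts LDS, GDS, scalar-memory, and message operations,
  // and vscnt (gfx10+) counts VMEM stores separately. An s_waitcnt stalls
  // until each counter has drained to at most the value it encodes.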
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works, so I did not attempt to model it.
  // Start each counter at its architectural maximum, which encodes "no wait".
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

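  // computeWaitCnt() overwrites the counters above with the limits actually
  // encoded by this wait instruction; counters it leaves untouched keep their
  // maximum value and therefore never trigger a stall.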
  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

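  // A stall is needed only for counters that currently exceed the limit this
  // wait allows; among those, stall until the soonest such instruction can
  // retire and decrement its counter.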
  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
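  // The gfx10 single-counter forms take a register operand plus an immediate;
  // the older packed forms further below take one immediate that encodes all
  // of the counters at once.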
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // We should probably be checking for nullptr here, but I'm not sure how
    // the case where we see a nullptr ought to be handled.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // The instruction is using a real register. Since we can't know what
      // value this register will have, we can't compute what the value of
      // this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    // Nested switch so the operand handling above isn't repeated for each
    // case. There are more clever ways to avoid this extra switch, and anyone
    // should feel free to implement one of them.
    switch (Opcode) {
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
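    // These forms pack vmcnt, expcnt, and lgkmcnt into a single immediate
    // whose field layout varies by ISA version; decodeWaitcnt() unpacks the
    // individual counters for us.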
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions being looked at are in MachineInstr form, whereas here we
  // only have access to the MCInst form. The side effect of this is that we
  // can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions, so we conservatively assume that
  // they would return true. This may cause a few instructions to be
  // incorrectly tagged with an extra CNT. However, these are instructions
  // that do interact with at least one CNT, so giving them an extra CNT
  // shouldn't cause issues in most scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

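  // One WaitCntInfo entry per source instruction, recording which of the four
  // counters that instruction increments while it is in flight.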
  int Index = 0;
  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
    const std::unique_ptr<Instruction> &Inst = *I;
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// Taken from SIInstrInfo::isVMEM().
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// Taken from SIInstrInfo::hasModifiersSet().
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// Taken from SIInstrInfo::isAlwaysGDS().
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

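// Factory functions registered with the TargetRegistry below; llvm-mca looks
// these up through the target to construct the hooks when run on AMDGPU
// assembly.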
static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}