xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===------------------ AMDGPUCustomBehaviour.cpp ---------------*-C++ -* -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// This file implements methods from the AMDGPUCustomBehaviour class.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUCustomBehaviour.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "TargetInfo/AMDGPUTargetInfo.h"
17 #include "Utils/AMDGPUBaseInfo.h"
18 #include "llvm/MC/TargetRegistry.h"
19 #include "llvm/Support/WithColor.h"
20 
21 namespace llvm::mca {
22 
postProcessInstruction(std::unique_ptr<Instruction> & Inst,const MCInst & MCI)23 void AMDGPUInstrPostProcess::postProcessInstruction(
24     std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
25   switch (MCI.getOpcode()) {
26   case AMDGPU::S_WAITCNT:
27   case AMDGPU::S_WAITCNT_soft:
28   case AMDGPU::S_WAITCNT_EXPCNT:
29   case AMDGPU::S_WAITCNT_LGKMCNT:
30   case AMDGPU::S_WAITCNT_VMCNT:
31   case AMDGPU::S_WAITCNT_VSCNT:
32   case AMDGPU::S_WAITCNT_VSCNT_soft:
33   case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
34   case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
35   case AMDGPU::S_WAITCNT_VMCNT_gfx10:
36   case AMDGPU::S_WAITCNT_VSCNT_gfx10:
37   case AMDGPU::S_WAITCNT_gfx10:
38   case AMDGPU::S_WAITCNT_gfx6_gfx7:
39   case AMDGPU::S_WAITCNT_vi:
40     return processWaitCnt(Inst, MCI);
41   }
42 }
43 
44 // s_waitcnt instructions encode important information as immediate operands
45 // which are lost during the MCInst -> mca::Instruction lowering.
processWaitCnt(std::unique_ptr<Instruction> & Inst,const MCInst & MCI)46 void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
47                                             const MCInst &MCI) {
48   for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
49     MCAOperand Op;
50     const MCOperand &MCOp = MCI.getOperand(Idx);
51     if (MCOp.isReg()) {
52       Op = MCAOperand::createReg(MCOp.getReg());
53     } else if (MCOp.isImm()) {
54       Op = MCAOperand::createImm(MCOp.getImm());
55     }
56     Op.setIndex(Idx);
57     Inst->addOperand(Op);
58   }
59 }
60 
AMDGPUCustomBehaviour(const MCSubtargetInfo & STI,const mca::SourceMgr & SrcMgr,const MCInstrInfo & MCII)61 AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
62                                              const mca::SourceMgr &SrcMgr,
63                                              const MCInstrInfo &MCII)
64     : CustomBehaviour(STI, SrcMgr, MCII) {
65   generateWaitCntInfo();
66 }
67 
checkCustomHazard(ArrayRef<InstRef> IssuedInst,const InstRef & IR)68 unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
69                                                   const InstRef &IR) {
70   const Instruction &Inst = *IR.getInstruction();
71   unsigned Opcode = Inst.getOpcode();
72 
73   // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
74   // pseudo instructions here. However, there are plans for the future to make
75   // it possible to use mca within backend passes. As such, I have left the
76   // pseudo version of s_waitcnt within this switch statement.
77   switch (Opcode) {
78   default:
79     return 0;
80   case AMDGPU::S_WAITCNT: // This instruction
81   case AMDGPU::S_WAITCNT_soft:
82   case AMDGPU::S_WAITCNT_EXPCNT:
83   case AMDGPU::S_WAITCNT_LGKMCNT:
84   case AMDGPU::S_WAITCNT_VMCNT:
85   case AMDGPU::S_WAITCNT_VSCNT:
86   case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
87   case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
88   case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
89   case AMDGPU::S_WAITCNT_VMCNT_gfx10:
90   case AMDGPU::S_WAITCNT_VSCNT_gfx10:
91   case AMDGPU::S_WAITCNT_gfx10:
92   case AMDGPU::S_WAITCNT_gfx6_gfx7:
93   case AMDGPU::S_WAITCNT_vi:
94     // s_endpgm also behaves as if there is an implicit
95     // s_waitcnt 0, but I'm not sure if it would be appropriate
96     // to model this in llvm-mca based on how the iterations work
97     // while simulating the pipeline over and over.
98     return handleWaitCnt(IssuedInst, IR);
99   }
100 
101   return 0;
102 }
103 
handleWaitCnt(ArrayRef<InstRef> IssuedInst,const InstRef & IR)104 unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
105                                               const InstRef &IR) {
106   // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
107   // I do not know how that instruction works so I did not attempt to model it.
108   // set the max values to begin
109   unsigned Vmcnt = 63;
110   unsigned Expcnt = 7;
111   unsigned Lgkmcnt = 31;
112   unsigned Vscnt = 63;
113   unsigned CurrVmcnt = 0;
114   unsigned CurrExpcnt = 0;
115   unsigned CurrLgkmcnt = 0;
116   unsigned CurrVscnt = 0;
117   unsigned CyclesToWaitVm = ~0U;
118   unsigned CyclesToWaitExp = ~0U;
119   unsigned CyclesToWaitLgkm = ~0U;
120   unsigned CyclesToWaitVs = ~0U;
121 
122   computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
123 
124   // We will now look at each of the currently executing instructions
125   // to find out if this wait instruction still needs to wait.
126   for (const InstRef &PrevIR : IssuedInst) {
127     const Instruction &PrevInst = *PrevIR.getInstruction();
128     const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
129     const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
130     const int CyclesLeft = PrevInst.getCyclesLeft();
131     assert(CyclesLeft != UNKNOWN_CYCLES &&
132            "We should know how many cycles are left for this instruction");
133     if (PrevInstWaitInfo.VmCnt) {
134       CurrVmcnt++;
135       if ((unsigned)CyclesLeft < CyclesToWaitVm)
136         CyclesToWaitVm = CyclesLeft;
137     }
138     if (PrevInstWaitInfo.ExpCnt) {
139       CurrExpcnt++;
140       if ((unsigned)CyclesLeft < CyclesToWaitExp)
141         CyclesToWaitExp = CyclesLeft;
142     }
143     if (PrevInstWaitInfo.LgkmCnt) {
144       CurrLgkmcnt++;
145       if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
146         CyclesToWaitLgkm = CyclesLeft;
147     }
148     if (PrevInstWaitInfo.VsCnt) {
149       CurrVscnt++;
150       if ((unsigned)CyclesLeft < CyclesToWaitVs)
151         CyclesToWaitVs = CyclesLeft;
152     }
153   }
154 
155   unsigned CyclesToWait = ~0U;
156   if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
157     CyclesToWait = CyclesToWaitVm;
158   if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
159     CyclesToWait = CyclesToWaitExp;
160   if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
161     CyclesToWait = CyclesToWaitLgkm;
162   if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
163     CyclesToWait = CyclesToWaitVs;
164 
165   // We may underestimate how many cycles we need to wait, but this
166   // isn't a big deal. Our return value is just how many cycles until
167   // this function gets run again. So as long as we don't overestimate
168   // the wait time, we'll still end up stalling at this instruction
169   // for the correct number of cycles.
170 
171   if (CyclesToWait == ~0U)
172     return 0;
173   return CyclesToWait;
174 }
175 
computeWaitCnt(const InstRef & IR,unsigned & Vmcnt,unsigned & Expcnt,unsigned & Lgkmcnt,unsigned & Vscnt)176 void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
177                                            unsigned &Expcnt, unsigned &Lgkmcnt,
178                                            unsigned &Vscnt) {
179   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
180   const Instruction &Inst = *IR.getInstruction();
181   unsigned Opcode = Inst.getOpcode();
182 
183   switch (Opcode) {
184   case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
185   case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
186   case AMDGPU::S_WAITCNT_VMCNT_gfx10:
187   case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
188     // Should probably be checking for nullptr
189     // here, but I'm not sure how I should handle the case
190     // where we see a nullptr.
191     const MCAOperand *OpReg = Inst.getOperand(0);
192     const MCAOperand *OpImm = Inst.getOperand(1);
193     assert(OpReg && OpReg->isReg() && "First operand should be a register.");
194     assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
195     if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
196       // Instruction is using a real register.
197       // Since we can't know what value this register will have,
198       // we can't compute what the value of this wait should be.
199       WithColor::warning() << "The register component of "
200                            << MCII.getName(Opcode) << " will be completely "
201                            << "ignored. So the wait may not be accurate.\n";
202     }
203     switch (Opcode) {
204     // Redundant switch so I don't have to repeat the code above
205     // for each case. There are more clever ways to avoid this
206     // extra switch and anyone can feel free to implement one of them.
207     case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
208       Expcnt = OpImm->getImm();
209       break;
210     case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
211       Lgkmcnt = OpImm->getImm();
212       break;
213     case AMDGPU::S_WAITCNT_VMCNT_gfx10:
214       Vmcnt = OpImm->getImm();
215       break;
216     case AMDGPU::S_WAITCNT_VSCNT_gfx10:
217       Vscnt = OpImm->getImm();
218       break;
219     }
220     return;
221   }
222   case AMDGPU::S_WAITCNT_gfx10:
223   case AMDGPU::S_WAITCNT_gfx6_gfx7:
224   case AMDGPU::S_WAITCNT_vi:
225     unsigned WaitCnt = Inst.getOperand(0)->getImm();
226     AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
227     return;
228   }
229 }
230 
generateWaitCntInfo()231 void AMDGPUCustomBehaviour::generateWaitCntInfo() {
232   // The core logic from this function is taken from
233   // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
234   // that are being looked at are in the MachineInstr format, whereas we have
235   // access to the MCInst format. The side effects of this are that we can't use
236   // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
237   // functions. Therefore, we conservatively assume that these functions will
238   // return true. This may cause a few instructions to be incorrectly tagged
239   // with an extra CNT. However, these are instructions that do interact with at
240   // least one CNT so giving them an extra CNT shouldn't cause issues in most
241   // scenarios.
242   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
243   InstrWaitCntInfo.resize(SrcMgr.size());
244 
245   for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
246     const std::unique_ptr<Instruction> &Inst = EN.value();
247     unsigned Index = EN.index();
248     unsigned Opcode = Inst->getOpcode();
249     const MCInstrDesc &MCID = MCII.get(Opcode);
250     if ((MCID.TSFlags & SIInstrFlags::DS) &&
251         (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
252       InstrWaitCntInfo[Index].LgkmCnt = true;
253       if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
254         InstrWaitCntInfo[Index].ExpCnt = true;
255     } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
256       // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
257       // and mayAccessLDSThroughFlat(Inst) would both return true for this
258       // instruction. We have to do this because those functions use
259       // information about the memory operands that we don't have access to.
260       InstrWaitCntInfo[Index].LgkmCnt = true;
261       if (!STI.hasFeature(AMDGPU::FeatureVscnt))
262         InstrWaitCntInfo[Index].VmCnt = true;
263       else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
264         InstrWaitCntInfo[Index].VmCnt = true;
265       else
266         InstrWaitCntInfo[Index].VsCnt = true;
267     } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
268       if (!STI.hasFeature(AMDGPU::FeatureVscnt))
269         InstrWaitCntInfo[Index].VmCnt = true;
270       else if ((MCID.mayLoad() &&
271                 !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
272                ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
273                 !MCID.mayStore()))
274         InstrWaitCntInfo[Index].VmCnt = true;
275       else if (MCID.mayStore())
276         InstrWaitCntInfo[Index].VsCnt = true;
277 
278       // (IV.Major < 7) is meant to represent
279       // GCNTarget.vmemWriteNeedsExpWaitcnt()
280       // which is defined as
281       // { return getGeneration() < SEA_ISLANDS; }
282       if (IV.Major < 7 &&
283           (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
284         InstrWaitCntInfo[Index].ExpCnt = true;
285     } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
286       InstrWaitCntInfo[Index].LgkmCnt = true;
287     } else if (MCID.TSFlags & SIInstrFlags::EXP) {
288       InstrWaitCntInfo[Index].ExpCnt = true;
289     } else {
290       switch (Opcode) {
291       case AMDGPU::S_SENDMSG:
292       case AMDGPU::S_SENDMSGHALT:
293       case AMDGPU::S_MEMTIME:
294       case AMDGPU::S_MEMREALTIME:
295         InstrWaitCntInfo[Index].LgkmCnt = true;
296         break;
297       }
298     }
299   }
300 }
301 
302 // taken from SIInstrInfo::isVMEM()
isVMEM(const MCInstrDesc & MCID)303 bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
304   return MCID.TSFlags & SIInstrFlags::MUBUF ||
305          MCID.TSFlags & SIInstrFlags::MTBUF ||
306          MCID.TSFlags & SIInstrFlags::MIMG;
307 }
308 
309 // taken from SIInstrInfo::hasModifiersSet()
hasModifiersSet(const std::unique_ptr<Instruction> & Inst,unsigned OpName) const310 bool AMDGPUCustomBehaviour::hasModifiersSet(
311     const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
312   int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
313   if (Idx == -1)
314     return false;
315 
316   const MCAOperand *Op = Inst->getOperand(Idx);
317   if (Op == nullptr || !Op->isImm() || !Op->getImm())
318     return false;
319 
320   return true;
321 }
322 
323 // taken from SIInstrInfo::isGWS()
isGWS(uint16_t Opcode) const324 bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
325   const MCInstrDesc &MCID = MCII.get(Opcode);
326   return MCID.TSFlags & SIInstrFlags::GWS;
327 }
328 
329 // taken from SIInstrInfo::isAlwaysGDS()
isAlwaysGDS(uint16_t Opcode) const330 bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
331   return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
332 }
333 
334 } // namespace llvm::mca
335 
336 using namespace llvm;
337 using namespace mca;
338 
339 static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo & STI,const mca::SourceMgr & SrcMgr,const MCInstrInfo & MCII)340 createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
341                             const mca::SourceMgr &SrcMgr,
342                             const MCInstrInfo &MCII) {
343   return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
344 }
345 
346 static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo & STI,const MCInstrInfo & MCII)347 createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
348                              const MCInstrInfo &MCII) {
349   return new AMDGPUInstrPostProcess(STI, MCII);
350 }
351 
352 /// Extern function to initialize the targets for the AMDGPU backend
353 
LLVMInitializeAMDGPUTargetMCA()354 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
355   TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
356                                           createAMDGPUCustomBehaviour);
357   TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
358                                            createAMDGPUInstrPostProcess);
359 
360   TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
361                                           createAMDGPUCustomBehaviour);
362   TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
363                                            createAMDGPUInstrPostProcess);
364 }
365