1 //===------------------ AMDGPUCustomBehaviour.cpp ---------------*-C++ -* -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// This file implements methods from the AMDGPUCustomBehaviour class.
11 ///
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPUCustomBehaviour.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "TargetInfo/AMDGPUTargetInfo.h"
17 #include "Utils/AMDGPUBaseInfo.h"
18 #include "llvm/MC/TargetRegistry.h"
19 #include "llvm/Support/WithColor.h"
20
21 namespace llvm::mca {
22
postProcessInstruction(std::unique_ptr<Instruction> & Inst,const MCInst & MCI)23 void AMDGPUInstrPostProcess::postProcessInstruction(
24 std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
25 switch (MCI.getOpcode()) {
26 case AMDGPU::S_WAITCNT:
27 case AMDGPU::S_WAITCNT_soft:
28 case AMDGPU::S_WAITCNT_EXPCNT:
29 case AMDGPU::S_WAITCNT_LGKMCNT:
30 case AMDGPU::S_WAITCNT_VMCNT:
31 case AMDGPU::S_WAITCNT_VSCNT:
32 case AMDGPU::S_WAITCNT_VSCNT_soft:
33 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
34 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
35 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
36 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
37 case AMDGPU::S_WAITCNT_gfx10:
38 case AMDGPU::S_WAITCNT_gfx6_gfx7:
39 case AMDGPU::S_WAITCNT_vi:
40 return processWaitCnt(Inst, MCI);
41 }
42 }
43
44 // s_waitcnt instructions encode important information as immediate operands
45 // which are lost during the MCInst -> mca::Instruction lowering.
processWaitCnt(std::unique_ptr<Instruction> & Inst,const MCInst & MCI)46 void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
47 const MCInst &MCI) {
48 for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
49 MCAOperand Op;
50 const MCOperand &MCOp = MCI.getOperand(Idx);
51 if (MCOp.isReg()) {
52 Op = MCAOperand::createReg(MCOp.getReg());
53 } else if (MCOp.isImm()) {
54 Op = MCAOperand::createImm(MCOp.getImm());
55 }
56 Op.setIndex(Idx);
57 Inst->addOperand(Op);
58 }
59 }
60
AMDGPUCustomBehaviour(const MCSubtargetInfo & STI,const mca::SourceMgr & SrcMgr,const MCInstrInfo & MCII)61 AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
62 const mca::SourceMgr &SrcMgr,
63 const MCInstrInfo &MCII)
64 : CustomBehaviour(STI, SrcMgr, MCII) {
65 generateWaitCntInfo();
66 }
67
checkCustomHazard(ArrayRef<InstRef> IssuedInst,const InstRef & IR)68 unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
69 const InstRef &IR) {
70 const Instruction &Inst = *IR.getInstruction();
71 unsigned Opcode = Inst.getOpcode();
72
73 // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
74 // pseudo instructions here. However, there are plans for the future to make
75 // it possible to use mca within backend passes. As such, I have left the
76 // pseudo version of s_waitcnt within this switch statement.
77 switch (Opcode) {
78 default:
79 return 0;
80 case AMDGPU::S_WAITCNT: // This instruction
81 case AMDGPU::S_WAITCNT_soft:
82 case AMDGPU::S_WAITCNT_EXPCNT:
83 case AMDGPU::S_WAITCNT_LGKMCNT:
84 case AMDGPU::S_WAITCNT_VMCNT:
85 case AMDGPU::S_WAITCNT_VSCNT:
86 case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
87 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
88 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
89 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
90 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
91 case AMDGPU::S_WAITCNT_gfx10:
92 case AMDGPU::S_WAITCNT_gfx6_gfx7:
93 case AMDGPU::S_WAITCNT_vi:
94 // s_endpgm also behaves as if there is an implicit
95 // s_waitcnt 0, but I'm not sure if it would be appropriate
96 // to model this in llvm-mca based on how the iterations work
97 // while simulating the pipeline over and over.
98 return handleWaitCnt(IssuedInst, IR);
99 }
100
101 return 0;
102 }
103
handleWaitCnt(ArrayRef<InstRef> IssuedInst,const InstRef & IR)104 unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
105 const InstRef &IR) {
106 // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
107 // I do not know how that instruction works so I did not attempt to model it.
108 // set the max values to begin
109 unsigned Vmcnt = 63;
110 unsigned Expcnt = 7;
111 unsigned Lgkmcnt = 31;
112 unsigned Vscnt = 63;
113 unsigned CurrVmcnt = 0;
114 unsigned CurrExpcnt = 0;
115 unsigned CurrLgkmcnt = 0;
116 unsigned CurrVscnt = 0;
117 unsigned CyclesToWaitVm = ~0U;
118 unsigned CyclesToWaitExp = ~0U;
119 unsigned CyclesToWaitLgkm = ~0U;
120 unsigned CyclesToWaitVs = ~0U;
121
122 computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
123
124 // We will now look at each of the currently executing instructions
125 // to find out if this wait instruction still needs to wait.
126 for (const InstRef &PrevIR : IssuedInst) {
127 const Instruction &PrevInst = *PrevIR.getInstruction();
128 const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
129 const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
130 const int CyclesLeft = PrevInst.getCyclesLeft();
131 assert(CyclesLeft != UNKNOWN_CYCLES &&
132 "We should know how many cycles are left for this instruction");
133 if (PrevInstWaitInfo.VmCnt) {
134 CurrVmcnt++;
135 if ((unsigned)CyclesLeft < CyclesToWaitVm)
136 CyclesToWaitVm = CyclesLeft;
137 }
138 if (PrevInstWaitInfo.ExpCnt) {
139 CurrExpcnt++;
140 if ((unsigned)CyclesLeft < CyclesToWaitExp)
141 CyclesToWaitExp = CyclesLeft;
142 }
143 if (PrevInstWaitInfo.LgkmCnt) {
144 CurrLgkmcnt++;
145 if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
146 CyclesToWaitLgkm = CyclesLeft;
147 }
148 if (PrevInstWaitInfo.VsCnt) {
149 CurrVscnt++;
150 if ((unsigned)CyclesLeft < CyclesToWaitVs)
151 CyclesToWaitVs = CyclesLeft;
152 }
153 }
154
155 unsigned CyclesToWait = ~0U;
156 if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
157 CyclesToWait = CyclesToWaitVm;
158 if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
159 CyclesToWait = CyclesToWaitExp;
160 if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
161 CyclesToWait = CyclesToWaitLgkm;
162 if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
163 CyclesToWait = CyclesToWaitVs;
164
165 // We may underestimate how many cycles we need to wait, but this
166 // isn't a big deal. Our return value is just how many cycles until
167 // this function gets run again. So as long as we don't overestimate
168 // the wait time, we'll still end up stalling at this instruction
169 // for the correct number of cycles.
170
171 if (CyclesToWait == ~0U)
172 return 0;
173 return CyclesToWait;
174 }
175
computeWaitCnt(const InstRef & IR,unsigned & Vmcnt,unsigned & Expcnt,unsigned & Lgkmcnt,unsigned & Vscnt)176 void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
177 unsigned &Expcnt, unsigned &Lgkmcnt,
178 unsigned &Vscnt) {
179 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
180 const Instruction &Inst = *IR.getInstruction();
181 unsigned Opcode = Inst.getOpcode();
182
183 switch (Opcode) {
184 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
185 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
186 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
187 case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
188 // Should probably be checking for nullptr
189 // here, but I'm not sure how I should handle the case
190 // where we see a nullptr.
191 const MCAOperand *OpReg = Inst.getOperand(0);
192 const MCAOperand *OpImm = Inst.getOperand(1);
193 assert(OpReg && OpReg->isReg() && "First operand should be a register.");
194 assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
195 if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
196 // Instruction is using a real register.
197 // Since we can't know what value this register will have,
198 // we can't compute what the value of this wait should be.
199 WithColor::warning() << "The register component of "
200 << MCII.getName(Opcode) << " will be completely "
201 << "ignored. So the wait may not be accurate.\n";
202 }
203 switch (Opcode) {
204 // Redundant switch so I don't have to repeat the code above
205 // for each case. There are more clever ways to avoid this
206 // extra switch and anyone can feel free to implement one of them.
207 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
208 Expcnt = OpImm->getImm();
209 break;
210 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
211 Lgkmcnt = OpImm->getImm();
212 break;
213 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
214 Vmcnt = OpImm->getImm();
215 break;
216 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
217 Vscnt = OpImm->getImm();
218 break;
219 }
220 return;
221 }
222 case AMDGPU::S_WAITCNT_gfx10:
223 case AMDGPU::S_WAITCNT_gfx6_gfx7:
224 case AMDGPU::S_WAITCNT_vi:
225 unsigned WaitCnt = Inst.getOperand(0)->getImm();
226 AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
227 return;
228 }
229 }
230
generateWaitCntInfo()231 void AMDGPUCustomBehaviour::generateWaitCntInfo() {
232 // The core logic from this function is taken from
233 // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
234 // that are being looked at are in the MachineInstr format, whereas we have
235 // access to the MCInst format. The side effects of this are that we can't use
236 // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
237 // functions. Therefore, we conservatively assume that these functions will
238 // return true. This may cause a few instructions to be incorrectly tagged
239 // with an extra CNT. However, these are instructions that do interact with at
240 // least one CNT so giving them an extra CNT shouldn't cause issues in most
241 // scenarios.
242 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
243 InstrWaitCntInfo.resize(SrcMgr.size());
244
245 for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
246 const std::unique_ptr<Instruction> &Inst = EN.value();
247 unsigned Index = EN.index();
248 unsigned Opcode = Inst->getOpcode();
249 const MCInstrDesc &MCID = MCII.get(Opcode);
250 if ((MCID.TSFlags & SIInstrFlags::DS) &&
251 (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
252 InstrWaitCntInfo[Index].LgkmCnt = true;
253 if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
254 InstrWaitCntInfo[Index].ExpCnt = true;
255 } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
256 // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
257 // and mayAccessLDSThroughFlat(Inst) would both return true for this
258 // instruction. We have to do this because those functions use
259 // information about the memory operands that we don't have access to.
260 InstrWaitCntInfo[Index].LgkmCnt = true;
261 if (!STI.hasFeature(AMDGPU::FeatureVscnt))
262 InstrWaitCntInfo[Index].VmCnt = true;
263 else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
264 InstrWaitCntInfo[Index].VmCnt = true;
265 else
266 InstrWaitCntInfo[Index].VsCnt = true;
267 } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
268 if (!STI.hasFeature(AMDGPU::FeatureVscnt))
269 InstrWaitCntInfo[Index].VmCnt = true;
270 else if ((MCID.mayLoad() &&
271 !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
272 ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
273 !MCID.mayStore()))
274 InstrWaitCntInfo[Index].VmCnt = true;
275 else if (MCID.mayStore())
276 InstrWaitCntInfo[Index].VsCnt = true;
277
278 // (IV.Major < 7) is meant to represent
279 // GCNTarget.vmemWriteNeedsExpWaitcnt()
280 // which is defined as
281 // { return getGeneration() < SEA_ISLANDS; }
282 if (IV.Major < 7 &&
283 (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
284 InstrWaitCntInfo[Index].ExpCnt = true;
285 } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
286 InstrWaitCntInfo[Index].LgkmCnt = true;
287 } else if (MCID.TSFlags & SIInstrFlags::EXP) {
288 InstrWaitCntInfo[Index].ExpCnt = true;
289 } else {
290 switch (Opcode) {
291 case AMDGPU::S_SENDMSG:
292 case AMDGPU::S_SENDMSGHALT:
293 case AMDGPU::S_MEMTIME:
294 case AMDGPU::S_MEMREALTIME:
295 InstrWaitCntInfo[Index].LgkmCnt = true;
296 break;
297 }
298 }
299 }
300 }
301
302 // taken from SIInstrInfo::isVMEM()
isVMEM(const MCInstrDesc & MCID)303 bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
304 return MCID.TSFlags & SIInstrFlags::MUBUF ||
305 MCID.TSFlags & SIInstrFlags::MTBUF ||
306 MCID.TSFlags & SIInstrFlags::MIMG;
307 }
308
309 // taken from SIInstrInfo::hasModifiersSet()
hasModifiersSet(const std::unique_ptr<Instruction> & Inst,unsigned OpName) const310 bool AMDGPUCustomBehaviour::hasModifiersSet(
311 const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
312 int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
313 if (Idx == -1)
314 return false;
315
316 const MCAOperand *Op = Inst->getOperand(Idx);
317 if (Op == nullptr || !Op->isImm() || !Op->getImm())
318 return false;
319
320 return true;
321 }
322
323 // taken from SIInstrInfo::isGWS()
isGWS(uint16_t Opcode) const324 bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
325 const MCInstrDesc &MCID = MCII.get(Opcode);
326 return MCID.TSFlags & SIInstrFlags::GWS;
327 }
328
329 // taken from SIInstrInfo::isAlwaysGDS()
isAlwaysGDS(uint16_t Opcode) const330 bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
331 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
332 }
333
334 } // namespace llvm::mca
335
336 using namespace llvm;
337 using namespace mca;
338
339 static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo & STI,const mca::SourceMgr & SrcMgr,const MCInstrInfo & MCII)340 createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
341 const mca::SourceMgr &SrcMgr,
342 const MCInstrInfo &MCII) {
343 return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
344 }
345
346 static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo & STI,const MCInstrInfo & MCII)347 createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
348 const MCInstrInfo &MCII) {
349 return new AMDGPUInstrPostProcess(STI, MCII);
350 }
351
352 /// Extern function to initialize the targets for the AMDGPU backend
353
LLVMInitializeAMDGPUTargetMCA()354 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
355 TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
356 createAMDGPUCustomBehaviour);
357 TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
358 createAMDGPUInstrPostProcess);
359
360 TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
361 createAMDGPUCustomBehaviour);
362 TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
363 createAMDGPUInstrPostProcess);
364 }
365