//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
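// For example, `s_waitcnt vmcnt(0) lgkmcnt(0)` (an illustrative snippet, not
// taken from this file) carries its counter thresholds in a single immediate;
// without re-attaching that operand here, computeWaitCnt() below would have
// nothing to decode.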
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we would not
  // normally see pseudo instructions here. However, there are plans to make it
  // possible to use mca within backend passes, so the pseudo versions of
  // s_waitcnt are kept in this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit s_waitcnt 0, but it is
    // unclear whether modeling that would be appropriate in llvm-mca, given
    // how the pipeline is simulated over repeated iterations of the input.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}
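
// Models the stall caused by an s_waitcnt. For example (an illustrative
// scenario, not derived from a specific trace): if two VMEM loads are still
// in flight and the instruction is `s_waitcnt vmcnt(0)`, then CurrVmcnt (2)
// exceeds Vmcnt (0), so we report a stall equal to the smaller of the two
// loads' remaining cycle counts; the hazard is re-checked once that many
// cycles have elapsed.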
unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr,
  // which is not yet modeled.
  // Start each counter at its maximum value.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
    const InstRef &PrevIR = *I;
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}
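
// The combined-form s_waitcnt (e.g. `s_waitcnt vmcnt(0) lgkmcnt(0)`, shown
// only as an illustration) packs its vmcnt, expcnt, and lgkmcnt thresholds
// into a single immediate, which AMDGPU::decodeWaitcnt() unpacks according to
// the ISA version. The gfx10 single-counter forms instead take a register and
// an immediate for one specific counter.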
void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // TODO: getOperand() may return nullptr; decide how that case should be
    // handled instead of relying on the asserts below.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    // Nested switch so the register handling above is not repeated for each
    // case.
    switch (Opcode) {
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}
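
// Pre-computes, for every instruction in the source region, which counters it
// affects. As an illustrative example (assuming the usual encodings): a FLAT
// load such as flat_load_dword is tagged with LgkmCnt and VmCnt, while on
// targets with FeatureVscnt a FLAT store is tagged with LgkmCnt and VsCnt.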
void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effect of this is
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them an
  // extra CNT shouldn't cause issues in most scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  int Index = 0;
  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
    const std::unique_ptr<Instruction> &Inst = *I;
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt(),
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// Taken from SIInstrInfo::isVMEM().
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// Taken from SIInstrInfo::hasModifiersSet().
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// Taken from SIInstrInfo::isAlwaysGDS().
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}
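
// Usage note (illustrative, not normative): once the hooks above are
// registered, running llvm-mca with an AMDGPU triple, for example
//   llvm-mca -mtriple=amdgcn -mcpu=gfx1010 kernel.s
// routes s_waitcnt instructions through AMDGPUInstrPostProcess and lets
// AMDGPUCustomBehaviour report the extra stall cycles they introduce.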