//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
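  // The remaining cases are the real (subtarget-encoded) variants of
  // s_waitcnt.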
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.

  // Set the max values to begin with.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.
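
  // If none of the counters exceeds the limit requested by this s_waitcnt,
  // CyclesToWait is still ~0U and no stall is needed.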
  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr here, but I'm not sure how we
    // should handle the case where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effects of this are
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them
  // an extra CNT shouldn't cause issues in most scenarios.
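  // One WaitCntInfo entry is kept per instruction in the source region;
  // handleWaitCnt() looks these entries up using the issued instruction's
  // source index modulo the region size.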
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

} // namespace mca
} // namespace llvm
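
// The factory functions and registration below let llvm-mca pick up the
// AMDGPU-specific CustomBehaviour and InstrPostProcess when it is run with an
// AMDGPU target, for example:
//   llvm-mca -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 input.s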

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function used by llvm-mca to initialize the MCA-specific components
/// for the AMDGPU targets.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}