//===----------------- AMDGPUCustomBehaviour.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we would not
  // normally see any pseudo instructions here. However, there are plans to
  // make it possible to use mca within backend passes, so the pseudo versions
  // of s_waitcnt are kept in this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit s_waitcnt 0, but it is
    // unclear whether it would be appropriate to model this in llvm-mca given
    // how iterations work while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

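// Illustrative background (the example syntax below is ordinary AMDGPU
// assembly added for exposition, not code taken from this file): an s_waitcnt
// instruction stalls the wave until the named counters have drained to at
// most the requested values. For example,
//   s_waitcnt vmcnt(0) lgkmcnt(0)
// waits for all outstanding VMEM (vmcnt) and LGKM (LDS/GDS/scalar-memory/
// message, lgkmcnt) operations to complete, while expcnt tracks exports.
// handleWaitCnt() below models this by comparing the decoded counter targets
// against the number of still-executing instructions that affect each counter.
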
unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // That instruction is not well understood here, so no attempt is made to
  // model it.
  // Start each counter at its maximum value.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

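// Illustrative note (example syntax added for exposition; the mnemonics shown
// are regular AMDGPU assembly, not code from this file): gfx10 splits the
// legacy packed counter encoding into separate instructions, so disassembly
// commonly contains both forms, e.g.
//   s_waitcnt vmcnt(0) lgkmcnt(0)   ; combined form, one packed immediate
//   s_waitcnt_vscnt null, 0x0       ; gfx10 split form, register + immediate
// computeWaitCnt() below handles both shapes: the split *_gfx10 opcodes carry
// a register operand plus an immediate, while the combined forms carry a
// single packed immediate that is unpacked with AMDGPU::decodeWaitcnt().
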
void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // We should probably be checking for nullptr here, but it is not clear
    // how that case should be handled.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // The instruction is using a real register. Since we cannot know what
      // value this register will have, we cannot compute what the value of
      // this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    // Redundant switch so we do not have to repeat the code above for each
    // case. There are more clever ways to avoid this extra switch; feel free
    // to implement one of them.
    switch (Opcode) {
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

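// Illustrative examples (added for exposition; the mnemonics are ordinary
// AMDGPU instructions, not references into this file): generateWaitCntInfo()
// tags each instruction with the counters it increments. For instance, an LDS
// access such as ds_read_b32 and a scalar load such as s_load_dword increment
// lgkmcnt, a vector memory load such as buffer_load_dword increments vmcnt,
// and an export increments expcnt. FLAT instructions may access either LDS or
// VMEM, which is why they are treated conservatively below.
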
void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effect of this is
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them
  // an extra CNT shouldn't cause issues in most scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt(),
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// Taken from SIInstrInfo::isVMEM().
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// Taken from SIInstrInfo::hasModifiersSet().
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// Taken from SIInstrInfo::isGWS().
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// Taken from SIInstrInfo::isAlwaysGDS().
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

} // namespace llvm::mca

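// Usage note (illustrative; the exact command line is an example, not drawn
// from this file): once the hooks below are registered, llvm-mca can pick
// them up for AMDGPU inputs, e.g. an invocation along the lines of
//   llvm-mca -mtriple=amdgcn -mcpu=gfx906 kernel.s
// would construct an AMDGPUCustomBehaviour to model s_waitcnt stalls and an
// AMDGPUInstrPostProcess to preserve the s_waitcnt operands.
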
using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to register the MCA hooks (CustomBehaviour and
/// InstrPostProcess) for the AMDGPU backend.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}