//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

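  /// Check whether the address used by a memory instruction is computed from
  /// a value that was itself loaded from global, local, or constant memory,
  /// i.e. the access is indirect (for example, a pointer fetched from a
  /// buffer).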
  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is large stride.
  /// The purpose is to identify memory access pattern like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory access will be marked
  /// large stride memory access.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}

bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "  is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "  dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

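        // Accumulate the counts previously computed for the callee into the
        // caller; callees that are not in the map yet are skipped.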
        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }

  return &FI;
}

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
                    << '\n'
                    << " IAMInst: " << Info->IAMInstCount << '\n'
                    << " LSMInst: " << Info->LSMInstCount << '\n'
                    << " TotalInst: " << Info->InstCount << '\n');

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
  }

  return true;
}

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

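// Build the (base pointer, constant offset) description of a memory access
// that is used for stride tracking. Local (LDS) accesses are not tracked.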
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}